/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/* modifications for MX kernel lib made by
 * Brice.Goglin@ens-lyon.org (LIP/INRIA/ENS-Lyon) */

/* Revision-control identification string embedded in the object file
 * (the "@(#)" prefix is the conventional marker for what(1)-style tools). */
static const char __idstring[] = "@(#)$Id: mx.c,v 1.290 2006/12/14 07:38:59 loic Exp $";

#define mx_printf printk
#include "mx_arch.h"
#include "mx_misc.h"
#include "mx_instance.h"
#include "mx_malloc.h"
#include "mx_dma_map.h"
#include "mx_peer.h"
#include "mx_version.h"
#include <linux/random.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/kmod.h>
#include <linux/poll.h>

#if MX_KERNEL_LIB
/* Init/teardown hooks of the in-kernel MX library; they are not defined in
 * the part of the file visible here. */
int mx_init_klib(void);
void mx_finalize_klib(void);
#endif

#if (MX_CPU_powerpc64 || MX_CPU_powerpc)
#include <asm/pci-bridge.h>
#endif
#if LINUX_XX >= 26
#include <linux/cdev.h>
/* Class handle, only declared when building for 2.6+ kernels. */
static mx_class_t *mx_class;
#endif

/*
 * ppc64 only: uniform accessors for the per-device IOMMU/TCE table.
 *   mx_ppc64_iommu_t         - pointer type of the table
 *   MX_PPC64_IOMMU(dev)      - table attached to a struct pci_dev
 *   MX_PPC64_IOMMU_SIZE(tbl) - size field of that table
 * The definitions differ between 2.6 (struct iommu_table) and 2.4
 * (struct TceTable) kernels.
 */
#if MX_CPU_powerpc64
#ifdef CONFIG_PPC_ISERIES
#error iseries not supported
#endif
#if LINUX_XX >= 26
#include <asm/iommu.h>
typedef struct iommu_table *mx_ppc64_iommu_t;
/* Provide fallbacks when the kernel headers do not supply these helpers. */
#ifndef PCI_DN
#define PCI_DN(dn) (dn)
#endif
#ifndef PCI_GET_DN
#define PCI_GET_DN(dev) ((struct device_node *)((dev)->sysdata))
#endif
#define MX_PPC64_IOMMU(dev) (PCI_DN(PCI_GET_DN(dev))->iommu_table)
/* The size field is named it_mapsize or it_size depending on the kernel. */
#ifdef HAVE_IT_MAPSIZE
#define MX_PPC64_IOMMU_SIZE(iommu) ((iommu)->it_mapsize)
#else
#define MX_PPC64_IOMMU_SIZE(iommu) ((iommu)->it_size)
#endif /* HAVE_IT_MAPSIZE */
#else /* LINUX_XX >= 26 */
typedef struct TceTable *mx_ppc64_iommu_t;
#define MX_PPC64_IOMMU(dev) (((struct device_node *)((dev)->sysdata))->tce_table)
/* NOTE(review): 'tbl' is not parenthesized in the expansion below. */
#define MX_PPC64_IOMMU_SIZE(tbl) (tbl->size*(PAGE_SIZE/sizeof(union Tce)))
#include <asm/prom.h>
#include <asm/pci_dma.h>
#endif 	/* LINUX_XX >= 26 */
#endif	/* MX_CPU_powerpc64 */


/* NOTE(review): presumably the character-device major number used when the
 * device is registered; the registration code is not visible here. */
#define MX_MAJOR 220

/* Module metadata reported by modinfo. */
MODULE_AUTHOR("Maintainer: help@myri.com");
MODULE_DESCRIPTION("Myrinet Express (MX) driver");
#if LINUX_XX >= 26
MODULE_VERSION(MX_VERSION_STR);
#endif

/*
 * Driver-wide tunables.  Most of them are exposed as module parameters by
 * the module_param() lines that follow.
 */

/* Address of a kernel function taking a struct page *; when non-zero it is
 * called by mx_activate_page() on 2.4 kernels. */
unsigned long mx_activate_page_symbol = 0;
unsigned long mx_sprintf_symbol = 0;
static char *mx_mapper_path;
static unsigned mx_udev = 0;
#if LINUX_XX <= 24
devfs_handle_t mx_devfs_handle[2]; /* for mxctl and mxctlp */
#endif
unsigned mx_ether_rx_frags = 0;
unsigned mx_ether_csum = 1;
/* Consulted by mx_use_msi(): 0 or 1 force the answer, any other value
 * (the default, (unsigned)-1) lets the driver probe the chipset. */
unsigned mx_msi = -1; /* auto=-1  on=1 off=0 */
/* When zero, mx_pcie_bridge_conf() does not try to enable ECRC generation
 * on the upstream bridge. */
unsigned mx_ecrc = 1;
int mx_bus = -1;
unsigned mx_bh_intr = 0;
/* Target location, in GB, used by mx_bar64_reloc() when moving BAR0 above
 * 4GB; must be a multiple of 4 and at least 4. */
unsigned mx_bar64_loc = 32;
/* When non-zero (2.6+), mx_mmap() uses pgprot_noncached() for device
 * memory instead of editing the protection bits by hand. */
unsigned mx_std_uc = 0;

/* Extra page-attribute bits OR-ed into device mappings by mx_mmap(). */
static unsigned mx_page_pat_attr = 0;

/*
 * Module parameters.  S_IRUGO entries are read-only through sysfs;
 * those with S_IWUSR can also be changed by root at run time.
 *
 * NOTE(review): several variables registered here with type "int" are
 * declared "unsigned" in this file (mx_udev, mx_ether_rx_frags, mx_msi,
 * mx_ecrc, mx_bh_intr, mx_ether_csum, mx_bar64_loc, mx_std_uc).  Switching
 * them to "uint" would reject "mx_msi=-1" on the command line, so the
 * mismatch is left in place.
 */
module_param(mx_debug_mask, int, S_IRUGO | S_IWUSR);
module_param(mx_max_instance, int, S_IRUGO);
module_param(mx_max_nodes, int, S_IRUGO);
module_param(mx_max_endpoints, int, S_IRUGO);
module_param(mx_max_send_handles, int, S_IRUGO);
module_param(mx_max_rdma_windows, int, S_IRUGO);
module_param(mx_small_message_threshold, int, S_IRUGO);
module_param(mx_medium_message_threshold, int, S_IRUGO);
module_param(mx_security_disabled, int, S_IRUGO);
module_param(mx_intr_coal_delay, int, S_IRUGO);

/* Raw kernel symbol addresses supplied at load time. */
module_param(mx_activate_page_symbol, ulong, S_IRUGO);
module_param(mx_sprintf_symbol, ulong, S_IRUGO);

module_param(mx_mapper_path, charp, S_IRUGO);
module_param(mx_override_e_to_f, int, S_IRUGO);
module_param(mx_udev, int, S_IRUGO);
module_param(mx_ether_rx_frags, int, S_IRUGO);
module_param(mx_msi, int, S_IRUGO);
module_param(mx_ecrc, int, S_IRUGO);
module_param(mx_bus, int, S_IRUGO);
module_param(mx_bh_intr, int, S_IRUGO);
module_param(mx_z_loopback, int, S_IRUGO);
module_param(mx_ether_csum, int, S_IRUGO);
module_param(mx_parity_recovery, int, S_IRUGO | S_IWUSR);
module_param(mx_recover_from_all_errors, int, S_IRUGO | S_IWUSR);
module_param(mx_max_host_queries, int, S_IRUGO | S_IWUSR);
module_param(mx_pcie_down, int, S_IRUGO);
module_param(mx_bar64_loc, int, S_IRUGO);
module_param(mx_std_uc, int, S_IRUGO);
module_param(mx_pcie_down_on_error, int, S_IRUGO | S_IWUSR);

/* Taken by mx_lock_pages() around the pinning of private user pages. */
static spinlock_t mx_pin_lock = SPIN_LOCK_UNLOCKED;
spinlock_t mx_print_lock = SPIN_LOCK_UNLOCKED;

/* Compile-time choice: run the watchdog from a thread (1) or from a
 * timer plus deferred work (0). */
#define MX_USE_WATCHDOG_THREAD 1

#if MX_USE_WATCHDOG_THREAD
static struct completion mx_watchdog_completion;
wait_queue_head_t mx_watchdog_queue;
#else
static struct timer_list mx_watchdog_timer;
static struct mx_lxx_work_struct mx_watchdog_work;
#endif
static int mx_module_is_exiting;

/*
 * ioctl entry-point compatibility.  Without HAVE_UNLOCKED_IOCTL the name
 * "unlocked_ioctl" is redirected to the legacy "ioctl" member of
 * file_operations, the handler returns int and receives an extra leading
 * struct inode * argument.  With it, the handler returns long and takes
 * no inode argument.
 */
#ifndef HAVE_UNLOCKED_IOCTL
#define unlocked_ioctl ioctl
#define mx_ioctl_return_t int
#define MX_INODE_ARG struct inode *inode,
#else
#define mx_ioctl_return_t long
#define MX_INODE_ARG
#endif

/* Forward declarations of the file-operation handlers defined below. */
static unsigned long mx_pci_dev_base(struct pci_dev *dev, int bar);
ssize_t mx_read (struct file*, char*, size_t, loff_t*);
ssize_t mx_write (struct file*, const char*, size_t, loff_t*);
mx_ioctl_return_t mx_ioctl (MX_INODE_ARG struct file*, unsigned int,
			    unsigned long);
int mx_open (struct inode*, struct file*);
int mx_release (struct inode*, struct file*);
int mx_mmap (struct file*, struct vm_area_struct*);
unsigned int mx_poll(struct file *filp, poll_table *wait);


/*
 * File operations of the MX character devices.  Uses the old GNU
 * "member: value" initializer syntax.  "unlocked_ioctl" may be a macro
 * alias for "ioctl" (see the HAVE_UNLOCKED_IOCTL block above); the same
 * handler also serves compat_ioctl when the kernel provides that hook.
 */
struct file_operations mx_fops = {
  read: mx_read,
  write: mx_write,
  unlocked_ioctl: mx_ioctl,
#ifdef HAVE_COMPAT_IOCTL
  compat_ioctl: mx_ioctl,
#endif
  open: mx_open,
  release: mx_release,
  mmap: mx_mmap,
  poll: mx_poll,
  owner: THIS_MODULE
};

#ifdef CONFIG_PCI_MSI

/*
 * Walk the standard PCI capability list of 'pdev' looking for a
 * HyperTransport capability (id 8) of the MSI-mapping type whose enable
 * bit is set.
 *
 * Returns 1 if such a capability is found, 0 otherwise (end of list,
 * config-space read failure, or too many entries visited).
 */
int mx_hyper_msi_cap_on(struct pci_dev *pdev)
{
  uint8_t pos = PCI_CAPABILITY_LIST - 1;
  int visited = 0;

  for (;;) {
    uint32_t hdr;

    /* the link to the next capability is one byte past the current offset */
    if (pci_read_config_byte(pdev, pos + 1, &pos) != 0)
      break;
    /* bound the walk so a corrupt or circular list cannot spin forever */
    if (visited++ > 256/4)
      break;
    /* 0 and 0xff both mean "no further capability" */
    if (pos == 0 || pos == 0xff)
      break;
    pos &= 0xfc;

    /* cf hypertransport spec, msi mapping section */
    if (pci_read_config_dword(pdev, pos, &hdr) != 0)
      continue;
    if ((hdr & 0xff) != 8)			/* not a hypertransport cap */
      continue;
    if ((hdr & 0xf8000000) != 0xa8000000)	/* not the msi mapping type */
      continue;
    if (!(hdr & 0x10000))			/* msi mapping not enabled */
      continue;

    /* MSI mapping present and enabled */
    return 1;
  }
  return 0;
}

/*
 * Decide whether MSI should be used for 'pdev'.
 *
 * If the mx_msi module parameter is 0 or 1 that value is returned
 * unchanged.  Otherwise (auto mode) MSI is enabled when the topmost
 * bridge above the device is from Intel or advertises an enabled
 * HyperTransport MSI mapping, or when device 0 function 0 on the root
 * bus advertises one.
 *
 * Returns 1 to use MSI, 0 otherwise.
 */
static int
mx_use_msi(struct pci_dev *pdev)
{
  if (mx_msi == 1 || mx_msi == 0)
    return mx_msi;
  
  /*  find root complex for our device */
  while (pdev->bus && pdev->bus->self) {
    pdev = pdev->bus->self;
  }
  /* go for it if chipset is intel, or has hypertransport msi cap */
  if (pdev->vendor == PCI_VENDOR_ID_INTEL 
      || mx_hyper_msi_cap_on(pdev))
    return 1;
  
  /* the walk above tolerates a device without a bus; do not dereference
     pdev->bus below in that case */
  if (!pdev->bus)
    return 0;

  /*  check if main chipset device has hypertransport msi cap */
  pdev = pci_find_slot(pdev->bus->number, 0);
  if (pdev && mx_hyper_msi_cap_on(pdev))
    return 1;
  
  /* default off */
  return 0;
}
#endif /* CONFIG_PCI_MSI */

/*
 * Map one dword of PCIe extended configuration space of 'dev' through a
 * hard-coded memory-mapped window.  Only a few nvidia bridges (vendor
 * 0x10de) are known to this function.
 *
 * 'where' is the byte offset inside the device's 4KB config page.
 *
 * Returns an ioremap()ed pointer that the caller must iounmap(), or NULL
 * when the device is not one of the known chipsets, the mapping fails, or
 * the vendor/device id read back through the window does not match 'dev'.
 */
static uint32_t *
mx_mmio_ext_config(struct pci_dev *dev, int where)
{
  unsigned long cfg_base;
  uint32_t *probe;
  uint32_t id_seen;

  /* the window base is hard-coded for a few popular nvidia chipsets */
  if (dev->vendor != 0x10de)
    return NULL;

  if (dev->device == 0x005d)
    cfg_base = 0xe0000000UL;
  else if (dev->device >= 0x0374 && dev->device <= 0x0378)
    cfg_base = 0xf0000000UL;
  else
    return NULL;

  /* 1MB per bus, 4KB per device/function */
  cfg_base += ((unsigned long)dev->bus->number * 0x00100000UL
	       + (unsigned long)dev->devfn * 0x00001000UL);

  /* sanity check: dword 0 of the page must be our own vendor/device id */
  probe = (uint32_t *) ioremap(cfg_base, 4);
  if (!probe)
    return NULL;
  id_seen = *probe;
  iounmap(probe);

  if (id_seen != dev->vendor + (dev->device << 16)) {
    MX_WARN(("%s: Ext-conf-space at unknown address, contact help@myri.com\n",
	     mx_pci_name(dev)));
    return NULL;
  }

  return ioremap(cfg_base + where, 4);
}

/*
 * Read a dword of (extended) configuration space.  The regular
 * pci_read_config_dword() is tried first; if it fails, fall back to the
 * memory-mapped window provided by mx_mmio_ext_config().
 *
 * Returns 0 on success, otherwise the status of the failed
 * pci_read_config_dword() call.
 */
static int
mx_read_ext_config_dword(struct pci_dev *dev, int where, u32 *val)
{
	int rc = pci_read_config_dword(dev, where, val);
	uint32_t *window;

	if (!rc)
		return rc;

	window = mx_mmio_ext_config(dev, where);
	if (!window)
		return rc;

	*val = *window;
	iounmap(window);
	return 0;
}

/*
 * Write a dword of (extended) configuration space.  The regular
 * pci_write_config_dword() is tried first; if it fails, fall back to the
 * memory-mapped window provided by mx_mmio_ext_config().
 *
 * Returns 0 on success, otherwise the status of the failed
 * pci_write_config_dword() call.
 */
static int
mx_write_ext_config_dword(struct pci_dev *dev, int where, u32 val)
{
	int rc = pci_write_config_dword(dev, where, val);
	uint32_t *window;

	if (!rc)
		return rc;

	window = mx_mmio_ext_config(dev, where);
	if (!window)
		return rc;

	*window = val;
	iounmap(window);
	return 0;
}

/*
 * Search the PCIe extended capability chain of 'dev' (starting at offset
 * 0x100) for capability 'cap_id'.
 *
 * Returns the config-space offset of the capability, or 0 when it is not
 * found, a read fails, or the chain looks invalid.  At most 512 entries
 * are visited so a circular chain cannot hang the caller.
 */
static int
mx_find_ext_capability(struct pci_dev *dev, unsigned cap_id)
{
  unsigned cap = 0x100;
  int nbcap = 0;

  while (cap >= 0x100 && nbcap++ < 512) {
    uint32_t dw;
    if (mx_read_ext_config_dword(dev, cap, &dw) != 0)
      break;
    /* all-ones means extended config space is not reachable */
    if (dw == 0xffffffff)
      break;
    if ((dw & 0xffff) == cap_id)
      return cap;
    /* next pointer lives in bits 31:20; its two low bits are reserved,
       masking them keeps the following dword read aligned */
    cap = (dw >> 20) & 0xffc;
  }
  return 0;
}

/*
 * Configure the bridge upstream of the NIC:
 *  - clear PCI_COMMAND_SERR in its command register;
 *  - if it has a PCI Express capability, clear the low four bits of the
 *    device control register (error reporting enables);
 *  - on Intel devices 0x3595..0x359a, set a mask bit in config register
 *    0x144;
 *  - unless mx_ecrc is zero, enable ECRC generation in the Advanced Error
 *    Reporting capability when the bridge reports it as supported.
 */
static void
mx_pcie_bridge_conf(struct pci_dev *bridge)
{
  unsigned cap;
  unsigned err_cap;
  int ret;
  uint16_t cmd;
  uint32_t dw;

  pci_read_config_word(bridge, PCI_COMMAND, &cmd);
  pci_write_config_word(bridge, PCI_COMMAND, cmd & ~PCI_COMMAND_SERR);
  cap = pci_find_capability(bridge, PCI_CAP_ID_EXP);
  if (cap) {
    pci_read_config_word(bridge, cap + PCI_EXP_DEVCTL, &cmd);
    pci_write_config_word(bridge, cap + PCI_EXP_DEVCTL, cmd & ~0xf /* ~(ce|fe|nfe|ur)*/);
  }
  /* e7x20 have a pcie-surprise-down like bit in the "unit error regs", mask at 0x144 bit 19 */
  /* NOTE(review): the value OR-ed in below is 0x800, i.e. bit 11, not
     bit 19 (0x80000) as stated above -- confirm against the chipset
     datasheet which of the two is intended. */
  if (bridge->vendor == 0x8086
      && bridge->device >= 0x3595
      && bridge->device <= 0x359a
      && pci_read_config_dword(bridge, 0x144, &dw) == 0)
    pci_write_config_dword(bridge, 0x144, dw | 0x800);


  /* everything below only concerns ECRC */
  if (!mx_ecrc)
    return;
  
  cap = mx_find_ext_capability(bridge, PCI_EXT_CAP_ID_ERR);
  /* nvidia ext cap is not always linked in ext cap chain */
  if (!cap
      && bridge->vendor == 0x10de /* nvidia */
      && ((bridge->device == 0x005d /* ck804_pcie */
	   || (bridge->device >= 0x0374 /* nforce-mcp55 */
	       && bridge->device <= 0x0378 /* nforce-mcp55 */))))
    cap = 0x160;
  
  if (!cap)
    return;
  
  ret = mx_read_ext_config_dword(bridge, cap + PCI_ERR_CAP, &err_cap);
  if (ret) {
    MX_INFO(("failed reading ext-conf-space of %s\n", mx_pci_name(bridge)));
    MX_INFO(("\t pci=nommconf in use? or buggy/incomplete/absent acpi MCFG attr?\n"));
    return;
  }
  /* bridge is not capable of generating ECRC: nothing to enable */
  if (!(err_cap & PCI_ERR_CAP_ECRC_GENC))
    return;
  
  err_cap |= PCI_ERR_CAP_ECRC_GENE;
  /* the status of this write is not checked */
  mx_write_ext_config_dword(bridge, cap + PCI_ERR_CAP, err_cap);
  MX_INFO(("Enabling ECRC on upstream bridge %s\n",
	   mx_pci_name(bridge)));
} 


/*
 * x86_64 only: relocate BAR0 of the NIC above 4GB, at mx_bar64_loc GB,
 * when it sits below 4GB behind an Intel bridge with device id
 * 0x3595..0x359a.  The bridge's prefetchable window must match BAR0
 * exactly; both the device BAR upper dword and the bridge window upper
 * dwords are then rewritten and the pci_dev resource is adjusted.
 *
 * Does nothing on other architectures or when any precondition fails.
 */
static void
mx_bar64_reloc(struct pci_dev * dev)
{
#if MX_CPU_x86_64
  struct pci_dev *bridge = dev->bus->self;
  uint32_t dw;
  uint16_t w;
  uint64_t base, limit;

  if (dev->device != 8 || !bridge
      || bridge->vendor != PCI_VENDOR_ID_INTEL
      || bridge->device < 0x3595
      || bridge->device > 0x359a)
    return;
  
  /* already above 4GB */
  if (pci_resource_start(dev,0) >= (1ULL << 32))
    return;
  
  if (mx_bar64_loc < (MX_LINUX_PFN_MAX >> 20)) {
    MX_INFO(("Cannot shift Bar0 to %dGB, max_mem=%ldGB\n", 
	     mx_bar64_loc, MX_LINUX_PFN_MAX >> 20));
    return;
  }
#if 0
  if (list_entry(&dev->bus->devices, struct pci_dev, bus_list) != dev ||
      dev->bus_list->next != &dev->bus_devices) {
    MX_INFO(("Cannot shift Bar0, not alone on bus\n"));
  }
#endif
  /* rebuild the bridge's 64-bit prefetchable window [base, limit[ */
  pci_read_config_word(bridge, PCI_PREF_MEMORY_BASE, &w);
  base = (uint64_t)(w & ~0xfU) << 16;
  pci_read_config_word(bridge, PCI_PREF_MEMORY_LIMIT, &w);
  /* widen before adding the 1MB granule: in 32-bit arithmetic a window
     ending exactly at 4GB (w == 0xfff0) would wrap to 0 */
  limit = ((uint64_t)(w & ~0xfU) << 16) + 0x100000U;
  pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &dw);
  base += (uint64_t)dw << 32;
  pci_read_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, &dw);
  limit += (uint64_t)dw << 32;
  if (base != pci_resource_start(dev,0) 
      || limit != pci_resource_start(dev,0) + pci_resource_len(dev,0)) {
    printk("%s: prefetch  space [0x%llx,0x%llx[ from %s is not exactly our bar\n", 
	   pci_name(dev), base, limit, pci_name(bridge));
    return;
  }
  /* the upper dword counts in units of 4GB, so the target must be a
     non-zero multiple of 4 (GB) */
  if (mx_bar64_loc < 4 || mx_bar64_loc & 3) {
    MX_INFO(("Invalid value for mx_bar64_loc:0x%x\n", mx_bar64_loc));
    return;
  }
  pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, mx_bar64_loc / 4);
  pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, mx_bar64_loc / 4);
  pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, mx_bar64_loc / 4);
  /* keep the kernel's view of the resource in sync with the hardware */
  pci_resource_start(dev,0) += mx_bar64_loc * (1ULL << 30);
  pci_resource_end(dev,0) += mx_bar64_loc * (1ULL << 30);
  MX_INFO(("Moved bar0 to %d GB\n", mx_bar64_loc));
  return;
#endif
}


/* Tentative definition; installed as vma->vm_ops by mx_mmap(). */
static struct vm_operations_struct mx_vm_ops;

/*
 * Delay for roughly 'usecs' microseconds.
 *
 *  - usecs == 0 outside interrupt context: just yield the CPU once.
 *  - short delays (< 100us) or any delay in interrupt context: busy-wait
 *    with udelay().
 *  - otherwise: sleep uninterruptibly for the equivalent number of
 *    jiffies, rounded up by one tick.
 */
void
mx_spin(uint32_t usecs)
{
  if (usecs == 0 && !in_interrupt ()) {
      /* if usecs == 0, just do the sched_yield() equivalent */
    schedule();
  } else if (usecs < 100 || in_interrupt ()) {
    udelay (usecs);
  } else {
    unsigned long ticks;

    /* usecs * HZ overflows 32 bits for delays above ~4.29s at HZ=1000,
       which would shorten the sleep.  Convert whole seconds and the
       sub-second remainder separately: the result is the same
       floor(usecs * HZ / 1000000), without needing a 64-bit division,
       and cannot overflow for any realistic HZ (up to 4294). */
    ticks = (unsigned long)(usecs / 1000000) * HZ
      + ((unsigned long)(usecs % 1000000) * HZ) / 1000000;

    /* do not want to call udelay for long time. */
    /* let be uninterruptible to be sure the delay is respected */
    set_current_state(TASK_UNINTERRUPTIBLE);
    schedule_timeout (ticks + 1);
  }
}

/*
 * mx_kva_to_page() and its helpers below find the struct page backing
 * either a kernel virtual address or a user virtual address by walking
 * the page tables (see Rubini p.287).
 *
 * NOTE: cannot be called from an interrupt handler for user-space
 * addresses, since the current MM context is used when no mm is given.
 */

/*
 * Return the top-level page-table entry covering 'addr'.
 * For kernel addresses the kernel page table is used; otherwise the
 * table of 'mm', or of the current process when 'mm' is NULL.
 */
static inline pgd_t *
mx_pgd_offset (unsigned long addr, int kernel, struct mm_struct * mm)
{
  if (kernel)
    return pgd_offset_k(addr);

  /* no explicit mm: fall back to the calling process */
  if (!mm)
    mm = current->mm;

  return pgd_offset(mm, addr);
}


/*
 * Tell whether the page referenced by a page-table entry is backed by a
 * valid struct page.  2.6+ kernels check the pfn stored in 'pte';
 * older kernels check 'page' with VALID_PAGE().
 */
static inline int
mx_page_is_valid(struct page *page, pte_t *pte)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
  return pfn_valid(pte_pfn(*pte));
#else
  return VALID_PAGE(page);
#endif
}

/*
 * Translate a virtual address into its struct page.
 *
 *   is     - not used by this function
 *   addr   - virtual address to translate
 *   kernel - non-zero if 'addr' is a kernel address
 *   mm     - address space for user addresses (NULL: current process)
 *
 * Kernel addresses in the direct-mapped range are converted arithmetically;
 * everything else is resolved by walking the page tables.
 *
 * Returns the page, or 0 when any level of the walk is missing or bad,
 * when the final entry is absent, not present or not writable, or when
 * the resulting page is not valid.
 */
static struct page *
mx_kva_to_page (mx_instance_state_t * is,
		unsigned long addr, int kernel,
		struct mm_struct *mm)
{
  pgd_t *pgd;
#ifdef PUD_SHIFT
  pud_t *pud;
#else
  pgd_t *pud;
#endif
  pmd_t *pmd;
  pte_t *pte;
  struct page *found = NULL;

  MX_DEBUG_PRINT (MX_DEBUG_KVA_TO_PHYS, 
		  ("%s: (0x%lx, %d)\n", __FUNCTION__, addr, kernel));

  if (kernel) {
    MX_DEBUG_PRINT (MX_DEBUG_KVA_TO_PHYS, ("kernel\n"));

    /* direct-mapped low memory: plain translation, no table walk needed */
    if (addr >= PAGE_OFFSET && addr < (unsigned long) high_memory) {
      MX_DEBUG_PRINT (MX_DEBUG_KVA_TO_PHYS, ("low\n"));
      return virt_to_page ((void *) addr);
    }

    /* anything else must be a vmalloc address */
    /* beware, some variables are not exported on ppc 2.2.x */
    mx_assert (addr >= VMALLOC_START && addr < VMALLOC_END);
  }

  MX_DEBUG_PRINT (MX_DEBUG_KVA_TO_PHYS, ("out of kernel\n"));

  /* level 1: pgd */
  pgd = mx_pgd_offset(addr, kernel, mm);
  if (!pgd || pgd_none (*pgd))
    return NULL;
  if (pgd_bad (*pgd)) {
    MX_WARN (("mx_kvirt_to_page: bad PGD\n"));
    return NULL;
  }

  /* level 2: pud (folded into the pgd on kernels without PUD_SHIFT) */
#ifdef PUD_SHIFT
  pud = pud_offset (pgd, addr);
  if (pud_none (*pud) || !pud_present (*pud))
    return NULL;
  if (pud_bad (*pud)) {
    MX_WARN (("mx_kvirt_to_page: bad PUD\n"));
    return NULL;
  }
#else
  pud = pgd;
#endif

  /* level 3: pmd */
  pmd = pmd_offset (pud, addr);
  if (pmd_none (*pmd) || !pmd_present (*pmd))
    return NULL;
  if (pmd_bad (*pmd)) {
    MX_WARN (("mx_kvirt_to_page: bad PMD\n"));
    return NULL;
  }

  /* last level: the pte is mapped here and must be unmapped on every path */
  pte = pte_offset_map (pmd, addr);
  if (!pte_none (*pte) && pte_present (*pte) && pte_write (*pte)) {
    found = pte_page(*pte);
    if (!mx_page_is_valid(found, pte)) {
      printk("page is not valid\n");
      found = NULL;
    }
  }
  pte_unmap (pte);
  return found;
}

/*
 * read() handler.  The control devices (MX_CTL / MX_CTLP) always read as
 * empty.  For a board device, the text produced by
 * mx_instance_status_string() for that unit is served, honouring the
 * file offset.
 *
 * Returns the number of bytes copied (0 at or past the end of the text),
 * -EFAULT if the user buffer cannot be written, or the negated status of
 * mx_instance_status_string() on failure.
 */
ssize_t
mx_read(struct file* filp, char* buff, size_t count, loff_t* offp)
{
  struct inode *inode = filp->f_dentry->d_inode;
  int dev_minor = minor(inode->i_rdev);
  char *text = 0;
  int text_len = 0;
  int rc;

  if (dev_minor == MX_CTL || dev_minor == MX_CTLP)
    return 0;

  /* two minors per board */
  rc = mx_instance_status_string(dev_minor / 2, &text, &text_len);
  if (rc)
    return -rc;

  rc = 0;
  if (*offp <= text_len) {
    /* clamp the request to what is left of the text */
    if (*offp + count > text_len)
      count = text_len - *offp;

    if (copy_to_user(buff, text + *offp, count)) {
      rc = -EFAULT;
    } else {
      *offp += count;
      rc = count;
    }
  }

  /* the status string was allocated for us and is released here */
  mx_kfree(text);
  return rc;
}

/*
 * write() handler: nothing is consumed; always returns 0 and leaves
 * 'buff', 'count' and '*offp' untouched.
 */
ssize_t
mx_write(struct file* filp, const char* buff, size_t count,
	 loff_t* offp)
{
  return 0;
}

/*
 * poll() handler.  Only raw endpoints can be polled.
 *
 * Returns POLLERR for the control devices, for a file on which no
 * endpoint has been attached yet, for non-raw endpoints and when the
 * board is dead.  Otherwise returns POLLIN if a raw event is pending, or
 * 0 after flagging that a wakeup is needed.
 */
unsigned int
mx_poll(struct file *filp, poll_table *wait)
{
  int minr;
  struct inode *inode = filp->f_dentry->d_inode;
  mx_endpt_state_t *es;
  mx_instance_state_t *is;
  unsigned int mask = 0;
  unsigned long flags;

  minr = minor(inode->i_rdev);
  if (minr == MX_CTL || minr == MX_CTLP)
    return POLLERR;

  /* private_data stays NULL until MX_SET_ENDPOINT/MX_SET_RAW succeeds
     (see mx_open); polling before that must not dereference it */
  es = filp->private_data;
  if (es == NULL || es->flags != MX_ES_RAW)
    return POLLERR;

  is = es->is;

  /* check to see if the mcp died, since the raw endpoint opener
     will want to know about it */
  if (mx_is_dead(is))
    return POLLERR;

  poll_wait(filp, &is->raw.sync.sleep_queue, wait);

  /* test the event queue and arm the wakeup request under the same lock */
  mx_spin_lock_irqsave(&is->raw.spinlock, flags);
  if (STAILQ_FIRST(&is->raw.pending_events))
    mask |= POLLIN;
  else
    is->raw.wakeup_needed = 1;
  mx_spin_unlock_irqrestore(&is->raw.spinlock, flags);

  return mask;
}

/*
 * Handle the MX_SET_ENDPOINT / MX_SET_RAW ioctls: allocate an endpoint
 * state, open the endpoint through mx_common_open() and attach it to the
 * file.
 *
 *   filp - file the endpoint gets attached to (filp->private_data)
 *   cmd  - must be MX_SET_ENDPOINT or MX_SET_RAW
 *   arg  - user address of an mx_set_endpt_t (only used when !raw)
 *   raw  - non-zero to open the raw endpoint
 *
 * Returns 0 on success or a positive errno value (EINVAL, EFAULT, ERANGE,
 * ENOMEM, or whatever mx_common_open() / mx_arch_copyout() return).
 */
static int
mx_set_endpoint (struct file* filp,  unsigned int cmd, 
		 mx_uaddr_t arg, int raw)
{
  mx_endpt_state_t *es;
  mx_set_endpt_t set_endpt;
  int unit;
  int status;
  size_t len;
  
  if (cmd != MX_SET_ENDPOINT && cmd != MX_SET_RAW)
    return EINVAL;

  /* unit bounds checking was done in open, and will be 
     done again in mx_common_open */
  unit = minor(filp->f_dentry->d_inode->i_rdev) / 2;

  if (!raw) {
    status = mx_arch_copyin(arg, &set_endpt, sizeof(set_endpt));
    if (status)
      return EFAULT;
    if (set_endpt.endpoint < 0 || set_endpt.endpoint >= mx_max_endpoints)
      return ERANGE;
  } else {
    /* nothing is copied in for a raw open; give the endpoint number a
       defined value instead of passing stack garbage to mx_common_open()
       and to the debug print below */
    set_endpt.endpoint = 0;
  }
  
  es = mx_kmalloc(sizeof(*es), MX_MZERO|MX_WAITOK);
  if (es == 0)
    return ENOMEM;

  /* odd minors are the privileged device nodes */
  es->privileged = minor(filp->f_dentry->d_inode->i_rdev) & 1;
  es->is_kernel = 0;
  es->opener.pid = mx_kgetpid();
  /* record the opener's command name, truncated to the smaller buffer */
  if (sizeof(current->comm) > sizeof(es->opener.comm))
    len = sizeof(es->opener.comm);
  else
    len = sizeof(current->comm);
  bcopy(current->comm, es->opener.comm, len);
  /* a truncated copy would otherwise be left without a terminator */
  es->opener.comm[sizeof(es->opener.comm) - 1] = '\0';

  status = mx_common_open(unit, set_endpt.endpoint, es, raw);
  
  if (status != 0) {
    mx_kfree(es);
    return (status);
  }
  set_endpt.session_id = es->session_id;
  if (!raw)
    status = mx_arch_copyout(&set_endpt, arg, sizeof(set_endpt));

  /* NOTE(review): the endpoint stays attached even if the copyout above
     failed; the caller then sees an error for an endpoint that is open. */
  filp->private_data = es;

  MX_DEBUG_PRINT (MX_DEBUG_OPENCLOSE, 
		  ("Board %d, endpoint %d opened\n", 
		   unit, set_endpt.endpoint));

  return status;
}

/*
 * ioctl() handler.
 *
 * On a file with no endpoint attached, MX_SET_ENDPOINT and MX_SET_RAW
 * create one; every other command goes to mx_endptless_ioctl().
 * On a file with an endpoint, the command goes to mx_common_ioctl() with
 * the endpoint's reference count raised for the duration of the call;
 * commands it does not know (ENOTTY) are retried through
 * mx_endptless_ioctl().
 *
 * The helpers return positive errno values; the result is negated before
 * being handed back to the kernel.
 */
mx_ioctl_return_t
mx_ioctl (MX_INODE_ARG struct file* filp, unsigned int cmd,
            unsigned long arg)
{
  mx_endpt_state_t *es;
  mx_uaddr_t uarg = (mx_uaddr_t)arg;
  /* odd minors are the privileged device nodes */
  int is_priv = minor(filp->f_dentry->d_inode->i_rdev) & 1;
  int err;

  if (filp->private_data == 0) {
    if (cmd == MX_SET_ENDPOINT)
      err = mx_set_endpoint(filp, cmd, uarg, 0);
    else if (cmd == MX_SET_RAW)
      err = mx_set_endpoint(filp, cmd, uarg, 1);
    else
      err = mx_endptless_ioctl(cmd, uarg, is_priv, 0);
    return (-1 * err);
  }

  es = filp->private_data;

  /* hold a reference on the endpoint while the ioctl runs */
  mx_mutex_enter(&es->sync);
  es->ref_count++;
  mx_mutex_exit(&es->sync);

  err = mx_common_ioctl(es, cmd, uarg);
  if (err == ENOTTY)
    err = mx_endptless_ioctl(cmd, uarg, is_priv, 0);

  mx_mutex_enter(&es->sync);
  es->ref_count--;
  mx_mutex_exit(&es->sync);

  return (-1 * err);
}

/*
 * open() handler.  Only validates the minor number: the control devices
 * are always accepted, board devices must map to a unit below
 * mx_max_instance.  The endpoint itself is created later by an ioctl, so
 * private_data starts out empty.
 *
 * Returns 0, or -ENODEV for an out-of-range unit.
 */
int
mx_open (struct inode* inode, struct file* filp)
{
  int dev_minor = minor(inode->i_rdev);
  int is_ctl = (dev_minor == MX_CTL || dev_minor == MX_CTLP);

  /* two minors per board */
  if (!is_ctl && dev_minor / 2 >= mx_max_instance)
    return -ENODEV;

  /* unit in bounds, open will be finished in ioctl */
  filp->private_data = 0;
  MX_MOD_INC_USE_COUNT;
  return 0;
}

/*
 * release() handler.  Detaches the endpoint (if one was ever attached by
 * an ioctl), closes and frees it, and drops the module use count taken
 * in mx_open().  Always returns 0.
 */
int
mx_release (struct inode* inode, struct file* filp)
{
  mx_endpt_state_t *endpt = filp->private_data;

  filp->private_data = 0;

  /* nothing to tear down if the endpoint was never fully opened */
  if (endpt) {
    mx_common_close(endpt);
    mx_kfree(endpt);
  }

  MX_MOD_DEC_USE_COUNT;
  return 0;
}

/*
 * Set PG_reserved on the page backing kernel virtual address 'kva'.
 * Logs a warning and does nothing if the address cannot be translated.
 */
void
mx_reserve_page(void *kva)
{
  /* mx_kva_to_page() does not use its instance argument */
  struct page * page = mx_kva_to_page(NULL, (unsigned long)kva, 1, NULL);

  if (!page) {
    MX_WARN (("mx_reserve_page: no page for kva %p\n", kva));
    return;
  }
  set_bit(PG_reserved,&page->flags);
}

/*
 * Clear PG_reserved on the page backing kernel virtual address 'kva'.
 * Logs a warning and does nothing if the address cannot be translated.
 */
void
mx_unreserve_page(void *kva)
{
  /* mx_kva_to_page() does not use its instance argument */
  struct page * page = mx_kva_to_page(NULL, (unsigned long)kva, 1, NULL);

  if (!page) {
    MX_WARN (("mx_unreserve_page: no page for kva %p\n", kva));
    return;
  }
  clear_bit(PG_reserved,&page->flags);
}

int
mx_mmap (struct file* filp, struct vm_area_struct* vma)
{
  mx_endpt_state_t *es;
  mx_instance_state_t *is;
  unsigned long start, end, pos, off, len;
  uint64_t phys;
  pgprot_t prot;
  struct page * page;
  void *kva;
  mx_page_pin_t *dontcare;
  int status;
  int mem_type;

  es = (mx_endpt_state_t *)filp->private_data;
  if (es == NULL)
    return -1;
  is = es->is;

  mx_mutex_enter(&es->sync);
  vma->vm_private_data = es;
  vma->vm_ops = &mx_vm_ops;
  vma->vm_flags |= VM_IO;

  start = vma->vm_start;
  end = vma->vm_end;
  off = vma->vm_pgoff << PAGE_SHIFT;
  len = end - start;
  prot = vma->vm_page_prot;

  /*
   * determine the mem type for this request
   */ 
  kva = NULL;
  status = mx_mmap_off_to_kva(es, off, &kva, &mem_type, &dontcare);
  if (status != 0) {
    MX_DEBUG_PRINT 
      (MX_DEBUG_KVA_TO_PHYS,
       ("status = %d, len = 0x%lx\n", status, len));
    goto abort_with_mutex;
  }

  if (mem_type == MX_MEM_HOSTMEM) {
    /*
     * loop to map all kernel pages (non-contiguous)
     */
    for (pos = 0; pos < len; pos += PAGE_SIZE) {
      /*
       * determine the kva for this request
       */
      kva = (void*)pos;
      status = mx_mmap_off_to_kva(es, off + pos, &kva, &mem_type, &dontcare);
      if (status != 0) {
	MX_DEBUG_PRINT 
	  (MX_DEBUG_KVA_TO_PHYS,
	   ("status = %d, pos = 0x%lx, len = 0x%lx\n", status, pos, len));
	goto abort_with_mutex;
      }
      /* 
       * remap_page_range needs a physical address
       */
      page = mx_kva_to_page(es->is, (unsigned long)kva, 1, NULL);
      phys = page_to_pfn(page) << PAGE_SHIFT;
      if (mx_remap_pfn_range(vma, start + pos, mx_linux_pfn(phys), PAGE_SIZE, prot)) {
	MX_WARN (("mx_remap_pfn_range failed: 0x%lx, 0x%08x%08x, 0x%lx\n",
		  (start + pos), MX_HIGHPART_TO_U32(phys),
		  MX_LOWPART_TO_U32(phys),
		  (unsigned long)PAGE_SIZE));
	goto abort_with_mutex;
      }
    }
  } else {
    int spec_bar = 0;
    unsigned long phys_base;

    if (is->board_type == MX_BOARD_TYPE_Z && mem_type == MX_MEM_SPECIAL)
      spec_bar = 2;
    phys_base = MX_LINUX_IOMEM2PHYS(mx_pci_dev_base(is->arch.pci_dev, spec_bar));
    /*
     * map device memory (contiguous)
     */
    switch (mem_type) {
    case MX_MEM_SRAM:
      phys = phys_base + (uintptr_t)kva - (uintptr_t)is->lanai.sram;
      break;
    case MX_MEM_CONTROL:
      phys = phys_base + (uintptr_t)kva - (uintptr_t)is->lanai.control_regs;
      break;
    case MX_MEM_SPECIAL:
      phys = phys_base + (uintptr_t)kva - (uintptr_t)is->lanai.special_regs;
      break;
    default:
      phys = 0; /* Placate compiler. */
      MX_WARN(("mx_mmap_off_to_kva returned with unknown memory type %d\n",
	       mem_type));
      mx_always_assert(0);
      break;
    }

    /* We must disable caching on device memory */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
    if (mx_std_uc) 
      {
	prot = pgprot_noncached(vma->vm_page_prot);
      } 
    else
#endif
      {
      pgprot_val (prot) &= ~MX_LINUX_PAGE_CACHE;
      pgprot_val (prot) |= MX_LINUX_PAGE_NOCACHE | mx_page_pat_attr;
      }

    if (mx_io_remap_pfn_range(vma, start,  mx_linux_pfn(phys), len, prot)) {
      MX_WARN (("mx_io_remap_pfn_range failed: 0x%lx, 0x%08x%08x, 0x%lx\n",
		start, MX_HIGHPART_TO_U32(phys),
		MX_LOWPART_TO_U32(phys), len));
      goto abort_with_mutex;
    }
  }
  
  mx_mutex_exit(&es->sync);
  return(0);

  abort_with_mutex:

  mx_mutex_exit(&es->sync);
  return -1;
  
}

/*
 * Report a failed assertion: print the assertion text together with the
 * source line and file it came from, then call BUG().
 */
void 
mx_assertion_failed (const char *assertion, int line, const char *file)
{
  printk("MX: assertion: <<%s>>  failed at line %d, file %s\n",
	 assertion, line, file);
  BUG();
}
/*********************************************************************
 * kernel memory allocation functions
 *********************************************************************/
/*
 * poor man's memory leak detection
 */
#if MX_DEBUG
/* Allocation / release counters, one pair per allocator. */
static int kmalloc_cnt = 0, kfree_cnt = 0;
static int vmalloc_cnt = 0, vfree_cnt = 0;
//static int ioremap_cnt = 0, iounmap_cnt = 0;
//static int dma_alloc_cnt = 0, dma_free_cnt = 0;
static int kernel_alloc_cnt = 0, kernel_free_cnt = 0;
//static int user_pin_cnt = 0, user_unpin_cnt = 0;
#endif

/* Number of user pages that may still be pinned: decremented by
 * mx_lock_one_page() and incremented by mx_unlock_one_page(). */
unsigned long mx_max_user_pinned_pages;
static unsigned long mx_max_user_pinned_pages_start;
static unsigned long mem_total_pages;
static int mx_linux_pci_driver_registered = 0;

/*
 * Helpers for sizing the pinned-page budget from a page count x.
 */
/* with little mem, reserve half memory */
/* Found that 3/4 was too much on some boxes with linux-2.4 */
#define MX_MAX_USER_PINNED_SMALLMEM(x) (((x)*4)/8)
/* with an average amount of memory, preserve a fixed amount (64MB, in pages) */
#define MX_MAX_SAVE_FROM_PINNED (64*1024*1024/PAGE_SIZE)
/* with a lot of mem, divide first to avoid overflow,
   and reserve a part proportional to memsize (7/8 may be pinned) */
#define MX_MAX_USER_PINNED_BIGMEM(x) (((x)/8)*7)

/****************************************************************
 * Synchronization functions
 ****************************************************************/

/*
 * Initialize spinlock 's' to the unlocked state.
 * 'is', 'unique' and 'str' are accepted for interface compatibility and
 * are not used by this implementation.
 */
void
mx_spin_lock_init(mx_spinlock_t *s, mx_instance_state_t *is, int unique, char *str)
{
  /* use the run-time initializer rather than assigning the deprecated
     SPIN_LOCK_UNLOCKED static initializer, which defeats lock debugging */
  spin_lock_init(s);
}

/*
 * Prepare a synchronization object for use: empty wait queue, wake
 * count of zero, and both semaphores (the wake semaphore and the mutex)
 * initialized as unlocked mutexes.
 * 'is', 'unique' and 'str' are not used by this implementation.
 */
void
mx_sync_init (mx_sync_t *s, mx_instance_state_t *is, int unique, char *str)
{
  /* sleeping side */
  init_waitqueue_head(&s->sleep_queue);
  atomic_set(&s->wake_cnt, 0);
  init_MUTEX(&s->wake_sem);

  /* plain mutual exclusion */
  init_MUTEX(&s->mutex);
}

/* Discard any wake-ups recorded but not yet claimed on 's'. */
void
mx_sync_reset (mx_sync_t *s)
{
  atomic_set(&s->wake_cnt, 0);
}

/* Counterpart of mx_sync_init(); this implementation has nothing to
 * release. */
void
mx_sync_destroy(mx_sync_t *s)
{
}

/* Acquire the mutex of 's' with down(): may sleep, and the wait is not
 * interruptible by signals. */
void
mx_mutex_enter(mx_sync_t *s)
{
  down(&(s->mutex));
}

/* Release the mutex of 's' taken by mx_mutex_enter(). */
void
mx_mutex_exit(mx_sync_t *s)
{
  up(&(s->mutex));
}

/*****************************************************************
 * Sleep functions
 *****************************************************************/

/* The interrupt handler atomically increments WAKE_CNT each time a
   wake interrupt is received, and the user threads decrement
   WAKE_CNT each time they claim a wake interrupt.   */

/****************
 * waking
 ****************/

/* Wake the thread sleeping on the synchronization variable. */

/*
 * Record one wake-up on 's' and wake the sleepers on its wait queue.
 * The count is incremented before wake_up() so that a thread woken here
 * finds the count already positive when it re-checks it in mx_sleep().
 */
void
mx_wake(mx_sync_t * s)
{
  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_wake called on s = %p\n", s));

  /* record the wake interrupt by incrementing the wake count.  This
     needs to be atomic because disabling interrupts globally on SMP 
     is very costly. */

  atomic_inc(&s->wake_cnt);
  wake_up(&s->sleep_queue);
}

/****************
 * sleeping
 ****************
 
 The following code claims a wake interrupt by atomically testing for a
 positive WAKE_CNT and decrementing WAKE_CNT.  We can assume we are the
 only one trying to consume wake_cnt, the caller is responsible to get a
 mutex to ensure this, so wake_cnt can only increase while we are here.
 A basic Linux rule: if you need to disable interrupts globally, your
 code is not written the right way :-) */

/* sleep until awakened, timeout or signal */

/*
 * Wait for a wake-up recorded by mx_wake() on 's'.
 *
 *   s     - synchronization object to sleep on
 *   ms    - timeout in milliseconds; MX_MAX_WAIT means no timeout
 *   flags - MX_SLEEP_INTR makes the sleep interruptible by signals
 *
 * Returns 0 when a wake-up was claimed (the wake count is decremented),
 * EAGAIN when the timeout expired first, and EINTR when a signal
 * interrupted an MX_SLEEP_INTR sleep.  Values are positive errno codes.
 *
 * Sleepers on the same object are serialized by s->wake_sem, which is
 * taken with an uninterruptible down() regardless of 'flags'.
 */
int
mx_sleep(mx_sync_t *s, int ms, int flags)
{
  long timeout;
  int ret = 0;
  DECLARE_WAITQUEUE(wait, current);

  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_sleep  sync = %p  ms=%d\n",
				   s, ms));
  /* take the mutex */
  down(&s->wake_sem);

  /* set the timeout.  If ms is MX_MAX_WAIT, sleep with no timeout;
     otherwise convert to jiffies, rounding a non-zero ms up to at least
     one tick.  With ms == 0 the timeout is 0 and the loop below is
     skipped: the call only polls the wake count. */
  if (ms == MX_MAX_WAIT) {
    timeout = MAX_SCHEDULE_TIMEOUT;
  } else {
    timeout = ((long)ms * HZ)/1000;
    if (timeout == 0 && ms)
      timeout = 1;
  }
 
  /* put the process in the queue before testing the event */
  add_wait_queue(&s->sleep_queue, &wait);
  while (timeout > 0 && 
	 atomic_read(&s->wake_cnt) <= 0) {
    /* use MX_SLEEP_INTR to allow signals */
    if (flags & MX_SLEEP_INTR) {
      set_current_state(TASK_INTERRUPTIBLE);
      if (signal_pending(current)) {
	set_current_state(TASK_RUNNING);
	break;
      }
    } else {
      set_current_state (TASK_UNINTERRUPTIBLE);
    }
    /* test again wake_cnt after setting current->state to avoid
       race condition */
    if (atomic_read(&s->wake_cnt) <= 0)
      timeout = schedule_timeout(timeout);
    /* reset state to RUNNING in case the if was not taken */
    set_current_state(TASK_RUNNING);
  }

  remove_wait_queue(&s->sleep_queue, &wait);

  if (atomic_read(&s->wake_cnt) <= 0) {
    /* no interrupt, timed out */
    if (timeout <= 0) {
      ret = EAGAIN;
    } else {
      /* time was left and nothing woke us: the loop can only have been
         left through the signal_pending() break */
      ret = EINTR;
      mx_always_assert(signal_pending(current));
    }
  } else {
    /* claims the interrupt */
    atomic_dec(&s->wake_cnt);
  }

  /* release the mutex */
  up(&s->wake_sem);
  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_sleep  sync = %p  timeout=%ld, ret = %d\n",
				   s, timeout, ret));

  return ret;
}

/***************************************************************
 * User/kernel memory page pinning/unpinning
 ***************************************************************/

/***************************************************************
 * User memory page locking/unlocking
 ***************************************************************/
/*
 * Mark a page that is about to be pinned as active / recently used.
 *
 * On 2.4 kernels built with HAVE_ASM_RMAP_H this calls the function whose
 * address was supplied in the mx_activate_page_symbol module parameter
 * (nothing happens if that parameter is 0).  On 2.6+ kernels it sets the
 * referenced flag and calls mark_page_accessed().  On any other
 * configuration it is a no-op.
 */
static void mx_activate_page(struct page * page)
{
  /* locked page on some linux versions might jam the inactive_dirty
     list, if we ensure they are active after being locked they should
     never end-up there */
#if defined HAVE_ASM_RMAP_H && LINUX_XX == 24
  if (mx_activate_page_symbol && LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10))
    {
      /* the target is called through a FASTCALL-typed pointer; the raw
         address comes straight from the module parameter and is not
         validated here */
      typedef void FASTCALL(page_func(struct page*));
      page_func *func;
      func = (page_func *)mx_activate_page_symbol;
      (*func)(page);
    }
#elif LINUX_XX >= 26
  SetPageReferenced(page);
  mark_page_accessed(page);
#endif
}

/*
 * Pin one user page on behalf of MX.
 *
 *   page - page to pin; NULL means the caller could not resolve it
 *   pin  - pin record to fill in (page pointer and "private" flag)
 *   vma  - mapping the page belongs to; must be neither VM_SHARED nor
 *          VM_HUGETLB (asserted)
 *
 * The number of MX pins on a page is kept in bits 16..27 of the page
 * count: each successful call adds 1 << 16 to page->mx_p_count, and the
 * first pin of a page consumes one unit of mx_max_user_pinned_pages.
 *
 * Returns 0 on success, or a positive errno value:
 *   EAGAIN - page is NULL, or the page was already locked while MX holds
 *            no pin on it
 *   ENXIO  - page count is 0, or its low 16 bits are already >= 1 << 15
 *   ERANGE - the page already carries 254 or more MX pins
 */
int
mx_lock_one_page(struct page *page, mx_page_pin_t *pin, struct vm_area_struct *vma)
{
      
  unsigned count;
  int was_locked = 0;
  int status;
  int count_mx;

  if (!page) {
    /* page might be swapped or read-only, caller should force write
       swapin and try again */
    return EAGAIN;
  }
  pin->private = !(vma->vm_flags & (VM_SHARED | VM_HUGETLB));
  mx_always_assert(pin->private);
  pin->page = page;
  count = page_count(page);
  /* MX's own pin count lives in the upper part of the page count */
  count_mx = (count >> 16) & 0xfff;

  if (count == 0) {
    MX_WARN (("trying to register a page with count 0\n"));
    return ENXIO;
  }
  
  if (pin->private) {
    /* NOTE(review): a non-zero result is treated as "the page was
       already locked" -- confirm against the mx_TryLockPage definition */
    was_locked = mx_TryLockPage (page);
    /* make sure the page is not already locked by somebody else */  
    if (was_locked && (count_mx == 0)) {
      MX_INFO (("trying to register an already locked page:OK\n"));
      return EAGAIN;
    }
    mx_activate_page(page);
  }
  /* make sure we won't be taking the count too high */
    
  /* note: the limit enforced here is 254, although the message below
     says 255 */
  if (count_mx >= 254UL) {
    MX_NOTE (("Not trying to register a page more than 255 times"
	      "    pages=%p , count =0x%x\n",
	      page, count));
    status = ERANGE;
    goto out_with_page_locked;
  }
  
  /* make sure the count hasn't gotten too high without our help.
     use 1 << 15 rather than (1 << 16) - 1 to leave a bit more margin */
  
  if ((count & 0xffff) >= (1UL << 15)) {
    MX_NOTE (("Not trying to register a page with an overly large count"
	      " page=%p\n", page));
    status = ENXIO;
    goto out_with_page_locked;
  }
  
  /* only the first MX pin of a page counts against the budget */
  mx_max_user_pinned_pages -= (count_mx == 0);
  atomic_add (1UL << 16, &page->mx_p_count);
  return 0;
  
 out_with_page_locked:
  /* undo the page lock only if this call is the one that took it */
  if (pin->private && !was_locked)
    unlock_page(page);
  return status;
}

/*
 * Drop one MX pin taken by mx_lock_one_page() on pin->page.
 *
 * When this is the last MX pin on the page (pin count in bits 16..27 of
 * the page count equal to 1), one unit is returned to
 * mx_max_user_pinned_pages and the page is unlocked.  The MX share of
 * the page count is then removed and the page released.
 */
void
mx_unlock_one_page(mx_page_pin_t *pin)
{
  int count;
  int count_mx;
  struct page * page = pin->page;

  mx_always_assert(pin->private);
  count = page_count(page);
  count_mx = (count >> 16) & 0xfff;
  if (count_mx == 1) {
    mx_max_user_pinned_pages += 1;
    if (pin->private) {
      unlock_page(page);
    }
  }
  mx_assert (!PageReserved (page));
#if MX_PAGE_COUNT_TO_ZERO
  if (atomic_sub_and_test (1UL << 16, &page->count)) {
    /* the count reached zero: take a reference back so that
       page_cache_release() performs the final release */
    get_page(page);
    page_cache_release(page);
  }
#else
  /* subtract one less than a full MX pin ... */
  atomic_sub ((1UL << 16) - 1, &page->mx_p_count);
  /* ... and let page_cache_release() drop the remaining reference
     (NOTE(review): this assumes mx_p_count aliases the page's reference
     count -- confirm against its definition) */
  page_cache_release(page);
#endif
}

/*
 * Pin the npages pages described by pins[] and build a DMA mapping for
 * each of them.
 *
 *   is     - instance used for the DMA mappings
 *   pins   - array of npages descriptors; pins[i].va is the input address,
 *            pins[i].dma (and page/private for user memory) are filled in
 *   flags  - MX_PIN_PHYSICAL: va is a physical address, only map it
 *            MX_PIN_KERNEL:   va is a kernel address, only map it
 *            otherwise:       va is a user address in mm
 *   mm     - address space to use for user memory, current->mm when NULL
 *
 * Returns 0 on success, or a positive errno-style value.  On failure every
 * mapping and page reference taken by this call is undone.
 *
 * Lock order for private user pages: mmap semaphore, then
 * mm->page_table_lock, then mx_pin_lock.
 */
int
mx_lock_pages(mx_instance_state_t *is, mx_page_pin_t *pins, int npages, int flags,
              struct mm_struct *mm)
{
  struct vm_area_struct *vma;
  int status = 0;
  int i;
  unsigned locked_num = 0;
  unsigned dma_num = 0;
  int locked_zone = 0;
  struct page *page;
  mx_page_pin_t * pin;
  int priv_page = 0;
  struct page * current_huge_page = NULL;
  uint64_t current_huge_va = 0;

  if (flags & MX_PIN_PHYSICAL) {
    for(i=0, pin=pins; i<npages; i++, pin++) {
      /* physical address, so just find the DMA address since we
         assume the user has already locked it. */
      mx_assert((pin->va & (PAGE_SIZE - 1)) == 0);
      page = mx_linux_phys_to_page(pin->va);
      status = mx_dma_map(is, page, &pin->dma);
      if (status != 0) {
        int j;
        /* undo the mappings made so far */
        for(j=0; j<i; j++)
          mx_dma_unmap(is, &pins[j].dma);
        break;
      }
    }
    return status;
  }

  if (flags & MX_PIN_KERNEL) {
    for (i=0, pin=pins; i < npages; i++, pin++) {
      /* kernel address, so just find the DMA address since in linux,
         kernel addresses are always wired */
      page = mx_kva_to_page(is, pin->va, 1, NULL);
      mx_assert(page);
      status = mx_dma_map(is, page, &pin->dma);
      if (status != 0) {
        int j;
        /* undo the mappings made so far */
        for(j=0; j<i; j++)
          mx_dma_unmap(is, &pins[j].dma);
        break;
      }
    }
    return status;
  }

  if (!mm)
    mm = current->mm;
  mx_mmap_down_write(mm);
  vma = find_vma(mm, pins[0].va);
  if (!vma) {
    MX_WARN (("No vma for addr: 0x%lx\n", (unsigned long)pins[0].va));
    status = EIO;
    goto out_error;
  }
  /* the whole request must be either private or shared/huge, as decided
     by the vma of the first page */
  priv_page = !(vma->vm_flags & (VM_SHARED | VM_HUGETLB));
  if (priv_page) {
    spin_lock(&mm->page_table_lock);
    spin_lock(&mx_pin_lock);
    locked_zone = 1;
  }
  if (mx_max_user_pinned_pages < npages) {
    status = ENOMEM;
    MX_WARN(("max_user_pinned_pages limit(%ld mb) reached, available=%ld kb\n", 
             mx_pages_to_mb(mx_max_user_pinned_pages_start),
             mx_max_user_pinned_pages*PAGE_SIZE/1024));
    goto out_error;
  }
  for (dma_num=0, pin=pins;dma_num<npages;dma_num++,pin++) {
    int retry = 0;
    do {
#if (defined CONFIG_HUGETLB_PAGE) && (defined HPAGE_SHIFT) && !(defined MX_CPU_ia64)
      /* HPAGE_SHIFT is a non-exported variable on ppc between at least 2.6.15 and 2.6.18.
       * HPAGE_SHIFT is defined to hpage_shift on ia64, and hpage_shift is not exported (at least since 2.6.9)
       */
      if (current_huge_page) {
        if ((pin->va >> HPAGE_SHIFT) == (current_huge_va >> HPAGE_SHIFT)) {
          /* still inside the huge page pinned earlier: derive the
             sub-page without taking another reference */
          page = pin->page = current_huge_page + ((pin->va - current_huge_va) >> PAGE_SHIFT);
          pin->huge_page_already_pinned = 1;
          break; /* next step is mx_dma_map */
        } else {
          /* we went out of the huge page */
          current_huge_page = NULL;
          current_huge_va = 0;
        }
      }
#endif
      pin->huge_page_already_pinned = 0;
      vma = find_vma(mm, pin->va);
      if (!vma || priv_page != !(vma->vm_flags & (VM_SHARED | VM_HUGETLB))) {
        MX_WARN (("Bad vma (%p, fl=0x%lx,priv=%d) for addr: 0x%lx\n", 
                  vma, vma ? vma->vm_flags : 0, priv_page, (unsigned long)pin->va));
        status = EIO;
        goto out_error;
      }
      if (priv_page) {
        page = mx_kva_to_page(is, pin->va, 0, mm);
        status = mx_lock_one_page(page, pin, vma);
      } else {
        /* use the same address space the vma was looked up in */
        if (get_user_pages(current, mm, (unsigned long)pin->va, 1, 1,
                           0, &page, NULL) == 1) {
          pin->private = priv_page;
          pin->page = page;
          if (vma->vm_flags & VM_HUGETLB) {
            current_huge_page = page;
            current_huge_va = pin->va;
          }
          status = 0;
        } else {
          status = EFAULT;
        }
      }
      if (status == EAGAIN) {
        /* tlb miss or locked page */
        if ((vma->vm_flags & (VM_MAYWRITE | VM_WRITE | VM_SHARED)) == VM_MAYWRITE) {
          mx_printf_once("Successfully forcing private ro mapping"
                         " to rw for registration\n");
          vma->vm_flags |= VM_WRITE;
        }

        /* get_user_pages() and wait_on_page_locked() may sleep, so the
           spinlocks are dropped around them */
        spin_unlock (&mx_pin_lock);
        spin_unlock (&mm->page_table_lock);
        locked_zone = 0;
        if (get_user_pages(current, mm, pin->va, 1, 1, 0, &page, 0) != 1) {
          status = EFAULT;
          MX_WARN (("No valid page found by get_user_page for this address: 0x%lx\n",
                    (unsigned long) pin->va));
          goto out_error;
        }
        /* MX only lock private pages, mapped once in the AS,
           the page cannot be locked by ourselves */
        wait_on_page_locked(page);
        page_cache_release(page);
        page = 0;
        if (retry++ >= 3) {
          MX_WARN(("tlb miss of address %lx 3 times in a row\n",
                   (unsigned long) pin->va));
          status = ENXIO;
          goto out_error;
        }
        /* re-take the locks in the same order as the initial
           acquisition (page_table_lock, then mx_pin_lock) */
        spin_lock(&mm->page_table_lock);
        spin_lock(&mx_pin_lock);
        locked_zone = 1;
      } else if (status) {
        goto out_error;
      }
    } while (status);

    mx_always_assert(!priv_page || PageLocked(page));
    locked_num += 1;
    status = mx_dma_map(is, page, &pin->dma);
    if (status) {
      static int seen;
      /* warn only once, but always fail the request */
      if (!seen) {
        seen = 1;
        MX_WARN (("mx_dma_map failed\n"));
      }
      goto out_error;
    }
  }
  if (priv_page) {
    spin_unlock (&mx_pin_lock);
    spin_unlock(&mm->page_table_lock);
  }
  mx_mmap_up_write(mm);

  return status;

 out_error:
  /* the unwinding below runs with both spinlocks held; take them on the
     mm that is unlocked at the end */
  if (!locked_zone) {
    spin_lock(&mm->page_table_lock);
    spin_lock(&mx_pin_lock);
  }
  /* pins[0..dma_num-1] have a DMA mapping */
  for (i=0;i<dma_num;i++) {
    mx_dma_unmap(is, &pins[i].dma);
  }
  /* pins[0..locked_num-1] have been pinned */
  for (i=0;i<locked_num;i++) {
    if (priv_page)
      mx_unlock_one_page(pins + i);
    else if (!pins[i].huge_page_already_pinned)
      /* sub-pages of an already pinned huge page hold no reference
         of their own */
      put_page(pins[i].page);
  }
  spin_unlock(&mx_pin_lock);
  spin_unlock(&mm->page_table_lock);
  mx_mmap_up_write(mm);
  return status;
}

/*
 * Pin npages pages with mx_lock_pages() and, on success, store their DMA
 * addresses in mdesc[] after converting each 32-bit half with htonl().
 * memory_context is passed to mx_lock_pages() as a struct mm_struct pointer.
 * Returns 0 on success or the mx_lock_pages() status.
 */
int 
mx_pin_host_pages(mx_instance_state_t *is, mx_page_pin_t *pins,
                  mcp_dma_addr_t *mdesc, int npages, int flags,
                  uint64_t memory_context)
{
  struct mm_struct *mm = (struct mm_struct *)(unsigned long) memory_context;
  int rc;
  int idx;

  rc = mx_lock_pages(is, pins, npages, flags, mm);
  if (rc != 0)
    return rc;

  for (idx = 0; idx < npages; idx++) {
    mcp_dma_addr_t *desc = &mdesc[idx];

    desc->low = htonl(pins[idx].dma.low);
    desc->high = htonl(pins[idx].dma.high);
  }
  return 0;
}


/*
 * Undo the pinning of npages pages, under mx_pin_lock.
 *
 * Every entry has its DMA mapping removed.  The page itself is released
 * only for user memory (neither MX_PIN_KERNEL nor MX_PIN_PHYSICAL) and
 * only when the entry is not flagged huge_page_already_pinned.
 */
void
mx_unpin_host_pages(mx_instance_state_t *is, mx_page_pin_t *pins, int npages, int flags)
{
  int idx = 0;

  spin_lock (&mx_pin_lock);
  while (idx < npages) {
    mx_page_pin_t *cur = &pins[idx++];

    mx_dma_unmap(is, &cur->dma);

    if (cur->huge_page_already_pinned
        || (flags & (MX_PIN_KERNEL|MX_PIN_PHYSICAL)))
      continue;

    BUG_ON(!cur->page);
    if (!cur->private)
      put_page(cur->page);
    else
      mx_unlock_one_page (cur);
  }
  spin_unlock (&mx_pin_lock);
}



/* Pin a single page: mx_lock_pages() with a count of one. */
int 
mx_pin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags, uint64_t memory_context)
{
  struct mm_struct *mm = (struct mm_struct *)(unsigned long) memory_context;

  return mx_lock_pages(is, pin, 1, flags, mm);
}

/* Unpin a single page: mx_unpin_host_pages() with a count of one. */
void
mx_unpin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags)
{
  const int one_page = 1;

  mx_unpin_host_pages(is, pin, one_page, flags);
}


/* Return an int whose bytes are filled by get_random_bytes(). */
int
mx_rand(void)
{
  int value;

  get_random_bytes(&value, sizeof(value));

  return value;
}

/*
 * Allocate len bytes of kernel memory.
 *
 * Requests of at most PAGE_SIZE bytes use kmalloc(GFP_KERNEL), larger
 * ones use vmalloc().  The memory is zeroed when flags contains MX_MZERO.
 * Returns NULL when the underlying allocator fails.  Release with
 * mx_kfree().
 */
void *
mx_kmalloc (size_t len, uint32_t flags)
{
  void *mem;

  MX_DEBUG_INC (MX_DEBUG_MALLOC, kernel_alloc_cnt);

  if (len > PAGE_SIZE) {
    mem = vmalloc(len);
    if (mem) {
      MX_DEBUG_INC (MX_DEBUG_MALLOC, vmalloc_cnt);
    }
  } else {
    mem = kmalloc(len, GFP_KERNEL);
    if (mem) {
      MX_DEBUG_INC (MX_DEBUG_MALLOC, kmalloc_cnt);
    }
  }

  if (!mem)
    return mem;

  if (flags & MX_MZERO)
    memset((char *)mem, 0, len);
  return mem;
}

/*
 * Free memory obtained from mx_kmalloc().
 *
 * The allocator is chosen from the address: a pointer strictly between
 * PAGE_OFFSET and high_memory goes to kfree(), anything else to vfree().
 */
void
mx_kfree (void *ptr)
{
  int in_direct_map;

  MX_DEBUG_INC (MX_DEBUG_MALLOC, kernel_free_cnt);

  in_direct_map = (ptr > (void *) PAGE_OFFSET) && (ptr < (void *) high_memory);
  if (!in_direct_map) {
    MX_DEBUG_INC (MX_DEBUG_MALLOC, vfree_cnt);
    vfree(ptr);
    return;
  }

  MX_DEBUG_INC (MX_DEBUG_MALLOC, kfree_cnt);
  kfree(ptr);
}

/*********************************************************************
 * memory mapping (into kernel space)
 *********************************************************************/

/*
 * Map len bytes starting at offset inside PCI BAR `bar` of the board into
 * kernel virtual space, uncached.
 *
 * Returns the kernel address, or NULL when the range does not fit in the
 * BAR, the BAR is not a memory resource, or the BAR has no base address.
 * The result of ioremap_nocache() is returned as is (after
 * IO_TOKEN_TO_ADDR where that macro exists).
 */
void *
mx_map_pci_space (mx_instance_state_t * is, int bar, uint32_t offset, uint32_t len)

{
  void *kaddr;
  unsigned long iomem = mx_pci_dev_base(is->arch.pci_dev, bar);

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
            ("mx_map_io_space(%p, 0x%x, %d)\n", is, offset, len));

  /* offset and len are both 32-bit: do the sum in 64 bits so that a
     wrapped value cannot slip through the bounds check */
  if ((uint64_t) offset + len > pci_resource_len(is->arch.pci_dev, bar)
      || !(pci_resource_flags(is->arch.pci_dev, bar) & IORESOURCE_MEM)
      || !iomem)
    return NULL;
  kaddr = ioremap_nocache(iomem + (unsigned long) offset, len);

#if defined IO_TOKEN_TO_ADDR
  kaddr = (void *) IO_TOKEN_TO_ADDR(kaddr);
#endif

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
            ("ioremapped 0x%p (offset 0x%x, len 0x%x)\n",
             kaddr, offset, len));

  return kaddr;
}


/*
 * Remove a kernel mapping of I/O space with iounmap().
 * is is unused; len is only reported in the debug trace.
 */
void
mx_unmap_io_space (mx_instance_state_t * is,
                        uint32_t len, void *kaddr)
{
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
                 ("iounmapping %p (len 0x%x)\n", kaddr, len));
  iounmap(kaddr);
}

#ifdef CONFIG_MTRR
/*
 * Make sure PAT entry 6 of the calling CPU holds the value 0x01.
 *
 * Entry 6 is bits 16..23 of the high word of the IA32_MSR_CR_PAT MSR.
 * The MSR is logged before the change and, when it had to be rewritten,
 * after it.  A function-local spinlock serialises the CPUs running this
 * at the same time.  info is unused.
 */
static void
mx_enable_pat(void *info)
{
  static spinlock_t pat_lock = SPIN_LOCK_UNLOCKED;

  uint32_t pat_lo, pat_hi;
  unsigned cpu = smp_processor_id();
  int entry6_ok;

  spin_lock(&pat_lock);
  rdmsr(IA32_MSR_CR_PAT, pat_lo, pat_hi);
  MX_INFO(("CPU%d: PAT = 0x%x%08x\n", cpu, pat_hi, pat_lo));
  entry6_ok = ((pat_hi & 0xff0000) == 0x010000);
  if (!entry6_ok) {
    /* replace entry 6 only, keep the other entries */
    pat_hi = (pat_hi & 0xff00ffff) | 0x00010000;
    wrmsr(IA32_MSR_CR_PAT, pat_lo, pat_hi);
    /* read back so the log shows what the CPU accepted */
    rdmsr(IA32_MSR_CR_PAT, pat_lo, pat_hi);
    MX_INFO(("CPU%d: new PAT = 0x%x%08x\n", cpu,pat_hi, pat_lo));
  }
  spin_unlock(&pat_lock);
}
#endif

/*
 * Enable write combining on BAR 0 of the board.
 *
 * With CONFIG_MTRR, an MTRR covering board_span bytes is requested and
 * its index stored in is->arch.mtrr.  If that fails, PAT is programmed on
 * every CPU through mx_enable_pat() and (except under Xen)
 * mx_page_pat_attr is set.  Without CONFIG_MTRR, is->arch.mtrr is just
 * set to -1.
 */
static inline void 
mx_setup_writecomb(mx_instance_state_t *is)
{
#ifdef CONFIG_MTRR
  unsigned long bar0 = mx_pci_dev_base(is->arch.pci_dev, 0);

  is->arch.mtrr = mtrr_add(bar0, is->board_span, MTRR_TYPE_WRCOMB, 1);
  if (is->arch.mtrr >= 0) {
    MX_INFO(("Board %d: Write Combining enabled through mtrr %d\n", mx_num_instances, is->arch.mtrr));
    return;
  }

  /* no MTRR available: fall back to PAT, on the other CPUs first and
     then on this one */
  MX_WARN(("Board %d: Using PAT\n", is->id));
  (void)smp_call_function(mx_enable_pat, 0, 0, 1);
  mx_enable_pat(0);
#ifndef CONFIG_XEN		/* _PAGE_PSE is not yet supported by Xen */
  mx_page_pat_attr = _PAGE_PSE;
#endif
#else /* !CONFIG_MTRR */
  is->arch.mtrr = -1;
#endif /* CONFIG_MTRR */
}

/*
 * Release the MTRR recorded in is->arch.mtrr, if there is one, and mark
 * the instance as having none (-1).
 */
static inline void 
mx_teardown_writecomb(mx_instance_state_t *is)
{
#ifdef CONFIG_MTRR
  if (is->arch.mtrr >= 0) {
    unsigned long bar0 = mx_pci_dev_base(is->arch.pci_dev, 0);

    mtrr_del(is->arch.mtrr, bar0, is->board_span);
  }
#endif /* CONFIG_MTRR */
  is->arch.mtrr = -1;
}

/*
 * Interrupt handler installed by request_irq() for one board;
 * instance_id is the mx_instance_state_t passed at registration.
 *
 * When mx_bh_intr is zero the common handler runs right here.  Otherwise
 * the handler only checks whether the current interrupt slot holds
 * something, then disables the board interrupt and defers the work to
 * is->arch.intr_work (mx_do_bh_intr), which re-enables the interrupt.
 */
static irqreturn_t
mx_linux_intr (int irq, void *instance_id, struct pt_regs *regs)
{
  mx_instance_state_t *is = instance_id;
  int handled;
  unsigned long flags;

  /* direct mode: handle the interrupt in this context */
  if (mx_bh_intr == 0) {
    handled = mx_common_interrupt(is);
    return IRQ_RETVAL(handled);
  }
  /* deferred mode: report "not ours" when the interrupt queue is not set
     up or the current slot has type 0 */
#if MX_DRIVER_API_MAGIC >= 0x500
  if (is->intr.ring_mask == 0)
    return IRQ_RETVAL(0);

  if (mx_intr_slot(is, is->intr.slot)->type == 0)
    return IRQ_RETVAL(0);
#else
  if (is->intr.maxslots == 0)
     return IRQ_RETVAL(0);
 
  if (is->intr.q[is->intr.intrq][is->intr.slot].type == 0)
    return IRQ_RETVAL(0);

#endif

  /* schedule the work item at most once: intr_pending stays set until
     mx_do_bh_intr clears it */
  spin_lock_irqsave(&is->arch.intr_pending_lock, flags);
  if (!is->arch.intr_pending) {
    is->arch.intr_pending = 1;
    is->board_ops.disable_interrupt(is);
    mx_lxx_schedule_work(&is->arch.intr_work);
  }
  spin_unlock_irqrestore(&is->arch.intr_pending_lock, flags);
  return IRQ_RETVAL(1);
}

/*
 * Create the device node for one minor number.
 *
 *   devname - base name of the node
 *   is      - owning instance (only used on 2.4 for board nodes)
 *   minor   - minor number; odd minors are the privileged nodes and are
 *             created read/write for the owner only, even minors are
 *             read/write for everybody
 *
 * Board nodes are named "<devname><minor/2>"; the MX_CTL and MX_CTLP
 * nodes use devname unchanged.  The name is built with snprintf() so an
 * unexpectedly long devname is truncated instead of overrunning the
 * 10-byte buffer.
 */
static void
mx_devfs_register (char *devname, mx_instance_state_t *is, int minor)
{
  char name[10];
  umode_t mode;
#if LINUX_XX <= 24
  devfs_handle_t *devfs_handle;
#endif
  int priv = minor & 1;

  mode = priv ? (S_IRUSR | S_IWUSR) : (S_IRUGO | S_IWUGO);

  if (minor != MX_CTL && minor != MX_CTLP) {
    snprintf(name, sizeof(name), "%s%d", devname, minor/2);
#if LINUX_XX <= 24
    devfs_handle = &is->arch.devfs_handle[priv];
#endif
  }
  else {
    snprintf(name, sizeof(name), "%s", devname);
#if LINUX_XX <= 24
    devfs_handle = &mx_devfs_handle[priv];
#endif
  }
#if LINUX_XX >= 26
  devfs_mk_cdev(MKDEV(MX_MAJOR, minor), S_IFCHR | mode, name);
  if (mx_udev) {
    mx_class_device_create(mx_class, MKDEV(MX_MAJOR, minor),
                           NULL, name);
  }
#else
  *devfs_handle =
    devfs_register(NULL, name, DEVFS_FL_DEFAULT, MX_MAJOR, minor,
                    S_IFCHR | mode, &mx_fops, 0);
#endif
}


/****************************************************************
 * mx_linux_register_ioctl32()
 * Registers 32 bit ioctls on 64 bit systems.
 ****************************************************************/
#if (MX_CPU_x86_64 || MX_CPU_powerpc64) && !defined HAVE_COMPAT_IOCTL
extern int register_ioctl32_conversion(unsigned int cmd,
				       int (*handler)(unsigned int,
						      unsigned int,
                                                      unsigned long,
                                                      struct file *));
extern int unregister_ioctl32_conversion(unsigned int cmd);

/*
 * Register a 32-bit conversion (with a NULL handler) for each of the
 * ioctls MX_IO(1) .. MX_IO(MX_NUM_IOCTLS).
 *
 * Returns 0 on success.  On failure the conversions registered so far
 * are removed again and the error of the failing registration is
 * returned, so nothing is left behind.
 */
static int
mx_linux_register_ioctl32(void)
{
  unsigned int i;
  int err;

  MX_INFO (("Registering 32 bit ioctls\n"));
  for (i=1; i<= MX_NUM_IOCTLS; i++)
    {
      err = register_ioctl32_conversion( MX_IO(i), NULL);
      if (err != 0)
        {
          MX_WARN(("Couldn't register 32 bit ioctl %d (0x%x)\n",
                   i, MX_IO(i)));
          /* roll back entries 1 .. i-1 */
          while (--i >= 1)
            (void) unregister_ioctl32_conversion(MX_IO(i));
          return err;
        }
    }
  return 0;
}

/*
 * Remove the 32-bit conversions of MX_IO(1) .. MX_IO(MX_NUM_IOCTLS).
 *
 * Every ioctl is attempted even if an earlier one fails, so that one
 * failure does not leave the remaining conversions registered.
 * Returns 0 when all succeeded, otherwise the first error seen.
 */
static int
mx_linux_unregister_ioctl32 (void)
{
  int i, err;
  int first_err = 0;

  for (i=1; i<= MX_NUM_IOCTLS; i++)
    {
      err = unregister_ioctl32_conversion (MX_IO(i));
      if (err != 0 && first_err == 0)
        first_err = err;
    }
  return first_err;
}
#endif /* MX_CPU_x86_64 || MX_CPU_powerpc64*/

/*
 * Return the base address of PCI BAR `bar` of dev, as given by
 * pci_resource_start().
 *
 * On ppc64, a value with any of its top four bits set is treated as a
 * kernel token rather than a real address; the base is then rebuilt from
 * the BAR register in PCI config space plus the host bridge's
 * pci_mem_offset, when both are available.
 */
static unsigned long 
mx_pci_dev_base(struct pci_dev *dev, int bar)
{
  unsigned long base;

  base = pci_resource_start (dev, bar);

#if MX_CPU_powerpc64
  /* - on ppc64 with eeh on, the kernel hides the real address without
     providing any way to map its fake handle to use space. In
     both cases get the information from the PCI register who never lies. */
  if ((base >> 60UL) > 0) {
    /* we got either a fake (token), or a already mapped address */
    unsigned int bus_base;
    struct pci_controller *hose = NULL;

    /* the device node may be missing: do not dereference it blindly */
    if (PCI_GET_DN(dev) && PCI_DN(PCI_GET_DN(dev)))
      hose = PCI_DN(PCI_GET_DN(dev))->phb;
    pci_read_config_dword(dev, PCI_BASE_ADDRESS_0 + bar * 4, &bus_base);
    bus_base &= PCI_BASE_ADDRESS_MEM_MASK;
    /* report the value of the BAR that was asked for */
    MX_WARN(("Linux faking pci_resource_start:pci_resource_start=0x%lx,"
             "bus_base=0x%x,hose->pci_mem_offset=%lx\n",
             base, bus_base,
             hose ? hose->pci_mem_offset : 0UL));
    if (bus_base && hose)
      base = bus_base + hose->pci_mem_offset;
  }
#endif /* MX_CPU_powerpc64 */
  return base;
}

#if LINUX_XX >= 26

/* Character device covering the per-board minors; registered in
   mx_linux_cdev_init() starting at minor 0. */
static struct cdev mx_cdev = {
  .kobj   = {.name = "mx", },
  .owner  = THIS_MODULE,
};

/* Character device covering the two control minors starting at MX_CTL;
   registered in mx_linux_cdev_init(). */
static struct cdev mxctl_cdev = {
  .kobj   = {.name = "mxctl", },
  .owner  = THIS_MODULE,
};


/*
 * Create the "mx" device class when udev support (mx_udev) is enabled.
 * Returns 0 on success or when udev support is off, ENXIO when the class
 * could not be created.
 */
static int
mx_linux_class_init(void)
{
  if (!mx_udev)
    return 0;

  mx_class = mx_class_create(THIS_MODULE, "mx");
  if (mx_class != NULL)
    return 0;

  MX_WARN(("mx_class_create returned %p\n", mx_class));
  return ENXIO;
}

/* Destroy the "mx" device class; does nothing when mx_udev is off. */
static void
mx_linux_class_fini(void)
{
  if (!mx_udev)
    return;

  mx_class_destroy(mx_class);
}

static int
mx_linux_cdev_init(void)
{
  int err;
  dev_t mx_dev = MKDEV(MX_MAJOR, 0);
  dev_t mxctl_dev = MKDEV(MX_MAJOR, MX_CTL);

  err = register_chrdev_region(mx_dev, 2*mx_max_instance, "mx");
  if (err != 0) {
    MX_WARN(("register_chrdev_region failed for mx devices with status %d\n", err));
    goto out;
  }
  cdev_init(&mx_cdev, &mx_fops);
  err = cdev_add(&mx_cdev, mx_dev, 2*mx_max_instance);
  if (err != 0) {
    MX_WARN(("cdev_add() failed for mx devices with status %d\n", err));
    goto out_with_mx_region;
  }

  err = register_chrdev_region(mxctl_dev, 2, "mxctl");
  if (err != 0) {
    MX_WARN(("register_chrdev_region failed for mxctl devices with status %d\n", err));
    goto out_with_mx_cdev;
  }
  cdev_init(&mxctl_cdev, &mx_fops);
  err = cdev_add(&mxctl_cdev, mxctl_dev, 2);
  if (err != 0) {
    MX_WARN(("cdev_add() failed for mxctl devices with status %d\n", err));
    goto out_with_mxctl_region;
  }
  return 0;

 out_with_mxctl_region:
  unregister_chrdev_region(mxctl_dev, 2);
 out_with_mx_cdev:
  cdev_del(&mx_cdev);
 out_with_mx_region:
  unregister_chrdev_region(mx_dev, 2*mx_max_instance);
 out:
  return err;
}

static void
mx_linux_cdev_fini(void)
{
  cdev_del(&mx_cdev);
  unregister_chrdev_region(MKDEV(MX_MAJOR, 0), 2*mx_max_instance);
  cdev_del(&mxctl_cdev);
  unregister_chrdev_region(MKDEV(MX_MAJOR, MX_CTL), 2);
}

#endif /* LINUX_XX >= 26 */

/*
 * Compute how many IOMMU pages this board may use (ppc64 only; a no-op
 * on other CPUs).
 *
 * All PCI devices with a normal header that share the board's IOMMU
 * table are counted, split into Myricom devices and others.  A quarter
 * of the table is put aside (capped at 32MB worth of pages when no
 * non-Myricom device shares the table) and the rest is divided evenly
 * between the Myricom devices.  The result goes into
 * is->arch.free_iommu_pages and is->arch.has_iommu is set.
 */
static void
mx_linux_pci_map_init(mx_instance_state_t *is)
{
#if MX_CPU_powerpc64
  struct pci_dev *pdev;
  int pages_put_aside;
  int myri_users = 0, other_users = 0;
  mx_ppc64_iommu_t tbl = MX_PPC64_IOMMU(is->arch.pci_dev);
  if (!tbl) {
    MX_INFO(("mx%d: no iommu found\n", is->id));
    return;
  }
  pdev = 0;
  while ((pdev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) {
    if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
      continue;
    if (PCI_GET_DN(pdev) && PCI_DN(PCI_GET_DN(pdev)) && 
        MX_PPC64_IOMMU(pdev) == tbl) {
      if (pdev == is->arch.pci_dev || pdev->vendor == MX_PCI_VENDOR_MYRICOM)
        myri_users += 1;
      else 
        other_users += 1;
    }
  }
  /* the scan can skip our own device (unusual header type, missing
     device node); count it anyway so the division below is never by 0 */
  if (myri_users == 0)
    myri_users = 1;

  /* use 3/4 of a iommu for myri devices (or all but 32MB if not shared) */

  pages_put_aside = MX_PPC64_IOMMU_SIZE(tbl)/4;
  /* if we have a dedicated IOMMU don't put more than 
     32MB aside */
  if (!other_users && pages_put_aside > 32*1024*1024 / PAGE_SIZE)
    pages_put_aside = 32*1024*1024 / PAGE_SIZE;
  atomic_set(&is->arch.free_iommu_pages, 
             (MX_PPC64_IOMMU_SIZE(tbl) - pages_put_aside)/ myri_users);
  is->arch.has_iommu = 1;
  MX_INFO(("mx%d: using %ld MB of iommu (Table is %ld Mbytes), users=%d/%d\n",
           is->id, (long)mx_pages_to_mb(atomic_read(&is->arch.free_iommu_pages)),
           (long)mx_pages_to_mb(MX_PPC64_IOMMU_SIZE(tbl)),
           myri_users, other_users));
#endif
}

/*
 * Launch the user-mode mapper for a board through call_usermodehelper().
 *
 * The program is mx_mapper_path when set, /opt/mx/sbin/mx_start_mapper
 * otherwise; it receives the board id as its only argument and a minimal
 * environment containing PATH.  Nothing is started when mx_udev is set.
 *
 * Returns 0 when mx_udev is set, otherwise the call_usermodehelper()
 * result.
 */
int
mx_start_mapper(mx_instance_state_t *is)
{
  int ret;
  char *exec_path = "/opt/mx/sbin/mx_start_mapper";
  char  unit[32];
  char  *argv[3];
  char  *envp[2];

  if (mx_udev)
    return 0;
  if (mx_mapper_path != NULL)
    exec_path = mx_mapper_path;
  sprintf(unit, "%d", is->id);
  /* argv[0] must name the program that is really executed */
  argv[0] = exec_path;
  argv[1] = unit;
  argv[2] = NULL;
  envp[0] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin";
  envp[1] = NULL;
  MX_INFO(("Starting usermode mapper at %s\n", exec_path));
  /* hand over the environment built above rather than no environment */
  ret = call_usermodehelper(exec_path, argv, envp
#if LINUX_XX != 24
                            , 1
#endif
                            );
  if (ret != 0) {
    MX_INFO(("Problem starting usermode mapper, ret = %d\n", ret));
  }
  return ret;
}

/*
 * Send SIGKILL to the process that opened the raw endpoint of this
 * instance, if there is one.  The pid is read under is->sync and the
 * signal is sent after the mutex has been released.  Always returns 0.
 */
int
mx_stop_mapper(mx_instance_state_t *is)
{
  pid_t opener;
  int have_raw;

  mx_mutex_enter(&is->sync);
  have_raw = (is->raw.es != NULL);
  if (have_raw)
    opener = is->raw.es->opener.pid;
  mx_mutex_exit(&is->sync);

  if (have_raw)
    kill_proc(opener, SIGKILL, 1);

  return 0;
}

void mx_linux_kwindow_timer(unsigned long data)
{
  mx_instance_state_t * is = (void *)data;
  is->kernel_window->jiffies = jiffies;
  is->arch.kwindow_timer.expires = jiffies + 1;
  if (!mx_module_is_exiting)
    add_timer(&is->arch.kwindow_timer);
}


/*
 * Deferred half of the interrupt handling, run from is->arch.intr_work;
 * arg is the mx_instance_state_t.  Runs the common interrupt handler,
 * then clears intr_pending and re-enables the board interrupt under
 * intr_pending_lock.
 */
static void
mx_do_bh_intr(void *arg)
{
  mx_instance_state_t *is = arg;
  unsigned long irq_flags;

  mx_common_interrupt(is);

  spin_lock_irqsave(&is->arch.intr_pending_lock, irq_flags);
  /* only mx_linux_intr schedules this work, and it sets the flag first */
  mx_always_assert(is->arch.intr_pending);
  is->arch.intr_pending = 0;
  is->board_ops.enable_interrupt(is);
  spin_unlock_irqrestore(&is->arch.intr_pending_lock, irq_flags);
}

/****************************************************************
 * mx_linux_create_instance
 *
 * Initializes the myrinet card specified.  If the card is
 *   initialized correctly, it increments mx_num_instances
 *   and adds it into the device array.
 * Arguments:
 *   dev - a pointer to the pci structure for the myrinet card
 * Returns:
 *   0 if card was initialized correctly
 *   -ENODEV otherwise
 ****************************************************************/

/* create a new device. Only at end and if no error occurs, we link it
   and increment mx_linux_num_instance. */
static int
mx_linux_create_instance (struct pci_dev *dev)
{
  mx_instance_state_t *is = NULL;
  int status = 0;
  uint32_t class;
  unsigned short vendor, device;
  uint8_t rev;

  MX_DEBUG_PRINT 
    (MX_DEBUG_BOARD_INIT,
     ("Using mx_linux_create_instance for instance %d\n",
      mx_num_instances));

  if (pci_enable_device(dev)) {
    MX_WARN(("%s:  pci_enable_device failed\n",
	     mx_pci_name(dev)));
    return -ENODEV;
  }
  pci_set_master(dev);

  /*
   * lots of this could be moved to arch-independent code
   * there are just sanity checks + filling of mx_pci_config
   */
  pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
  rev = (uint8_t)class;
  MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT,
		    ("Myrinet PCI probe device = %d  revision = 0x%x\n",
		     dev->devfn, class));
  if (class == 0xffffffff) {
    MX_INFO(("PCI config class/rev for %s is 0x%x, did card disappeared?\n",
	     mx_pci_name(dev), class));
    status = -ENODEV;
    goto abort;
  }

  pci_read_config_word(dev, PCI_VENDOR_ID, &vendor);
  pci_read_config_word(dev, PCI_DEVICE_ID, &device);

  MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT,
		  ("myri_pci_probe testing vendor=0x%x  device=0x%x\n",
		   vendor, device));

  if (vendor != MX_PCI_VENDOR_MYRICOM
      || (device != MX_PCI_DEVICE_MYRINET &&
	  device != MX_PCI_DEVICE_Z4E &&
	  device != MX_PCI_DEVICE_Z8E)) {
    MX_WARN(("Device Id %x:%x for %s is not recognized\n", 
	     vendor, device, mx_pci_name(dev)));
    status = -ENODEV;
    goto abort;
  }

  if (device == MX_PCI_DEVICE_MYRINET && rev < 4)  {
    MX_WARN (("Myrinet Board at %d:%d:%d has unsupported revision (%d)\n",
	      dev->bus->number, PCI_SLOT(dev->devfn),
	      PCI_FUNC(dev->devfn), rev));
    status = -ENODEV;
    goto abort;
  }

  is = (void *) mx_kmalloc(sizeof (*is), MX_MZERO);
  if (!is) {
    MX_WARN (("couldn't get memory for instance_state\n"));
    status = -ENOMEM;
    goto abort;
  }

  /* busbase is the value that can be passed to ioremap
     it might be different from both the PCI address,
     or the phys address or the kernel address
  */
  is->board_span = pci_resource_len(dev, 0);
  mx_bar64_reloc(dev);

  if (mx_pci_dev_base(dev, 0) == 0) {
    MX_WARN (("PCI BARs not accessible: base[0]=0x%lx\n",
	      mx_pci_dev_base(dev, 0)));
    status = -ENODEV;
    goto abort;
  }

  is->arch.pci_dev = dev;
  mx_linux_pci_map_init(is);

  status = pci_set_dma_mask(dev, (dma_addr_t)~0ULL);
  if (status != 0) {
    MX_INFO(("64-bit pci address mask was refused, trying 32-bit"));
    status = pci_set_dma_mask(dev, (dma_addr_t)0xffffffffULL);
  }
  if (status != 0) {
    MX_NOTE(("Error %d setting DMA mask\n", status));
    goto abort;
  }


#ifdef CONFIG_PCI_MSI
  if (mx_use_msi(is->arch.pci_dev)) {
    status = pci_enable_msi(is->arch.pci_dev);
    if (status != 0) {
      MX_WARN(("Error %d setting up MSIs, falling back to legacy interrupts\n",
	       status));
    } else {
      is->msi_enabled = 1;
    }
  }
#endif

  if (dev->device == 8
      && dev->bus->self) {
    /* tweak upstream component of Myrinet pcie express devices */
    mx_pcie_bridge_conf(dev->bus->self);
  }

  is->arch.irq = dev->irq;

  sprintf(is->arch.interrupt_string, "myri/mx%d", 
	   mx_num_instances);
  
  if (request_irq(is->arch.irq, (void *) mx_linux_intr,
                   SA_SHIRQ, is->arch.interrupt_string, is) == 0)  {
    MX_INFO (("Board %d: allocated %s IRQ %d\n", 
	      mx_num_instances, is->msi_enabled ? "MSI" : "legacy", is->arch.irq));
  } else {
    MX_NOTE (("Board %d: Could not allocate %s IRQ %d\n", 
	      mx_num_instances, is->msi_enabled ? "MSI" : "legacy", is->arch.irq));
    status = -ENXIO;
    goto abort_with_msi;
  }

  /* setup write combining */
  mx_setup_writecomb(is);

  is->kernel_window = (void*)__get_free_page(GFP_KERNEL);
  memset(is->kernel_window, 0, PAGE_SIZE);
  if (!is->kernel_window) {
    MX_NOTE (("Failed to allocate kernel window\n"));
    status = -ENOMEM;
    goto abort_with_irq;
  }
  is->kernel_window->hz = HZ;
  mx_reserve_page(is->kernel_window);

  /* generic board initialization; load MCP and stuff */

  MX_LXX_INIT_WORK(&is->arch.intr_work, mx_do_bh_intr, is);
  spin_lock_init(&is->arch.intr_pending_lock);

  status = mx_instance_init(is, mx_num_instances);
  if (status != 0)  {
      MX_NOTE (("mx_instance_init failed\n"));
      goto abort_with_irq;
  }
  pci_set_drvdata(dev,is);
  init_timer(&is->arch.kwindow_timer);
  is->arch.kwindow_timer.data = (unsigned long)is;
  is->arch.kwindow_timer.function = mx_linux_kwindow_timer;
  is->arch.kwindow_timer.expires = jiffies + 1;
  add_timer(&is->arch.kwindow_timer);

  /*
   * prepend to list of MX devices
   */
  mx_devfs_register("mx", is, is->id*2);
  mx_devfs_register("mxp", is, is->id*2 + 1);
  MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT, 
		    ("mx_instance_init succeeded for unit %d\n",
		     mx_num_instances));
  mx_always_assert(mx_num_instances <= mx_max_instance);
  mx_mutex_exit(&is->sync);

  return 0;

  /* ERROR Handling */
 abort_with_irq:
  MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT, 
		    ("freeing irq %d\n", is->arch.irq));
  free_irq(is->arch.irq, is);
  
  mx_teardown_writecomb(is);

 abort_with_msi:
#ifdef CONFIG_PCI_MSI
  if (is->msi_enabled)
    pci_disable_msi(is->arch.pci_dev);
  is->msi_enabled = 0;
#endif
 abort:
  if (is && is->kernel_window) {
    mx_unreserve_page(is->kernel_window);
    free_page((unsigned long)is->kernel_window);
  }
  if (is)
    mx_kfree(is);
  return status;
}


/*
 * Tear down one board: stop the kernel window timer and pending work,
 * remove the device nodes, disable the board interrupt, undo write
 * combining, finalize the generic instance, release the IRQ (and MSI),
 * free the kernel window page and finally the instance state itself.
 */
static void
mx_linux_destroy_instance (mx_instance_state_t *is)
{
  mx_assert (is != NULL);
  mx_assert(mx_instances[is->id] == is);

  /* stop the periodic timer and wait for scheduled work to finish */
  del_timer_sync(&is->arch.kwindow_timer);
  mx_lxx_flush_scheduled_work();

  /* remove the "mx<id>" and "mxp<id>" device nodes */
#if LINUX_XX >= 26
  devfs_remove("mxp%d",is->id);
  devfs_remove("mx%d",is->id);
  if (mx_udev) {
    mx_class_device_destroy(mx_class, MKDEV(MX_MAJOR, is->id * 2));
    mx_class_device_destroy(mx_class, MKDEV(MX_MAJOR, is->id * 2 + 1));
  }
#else
  devfs_unregister(is->arch.devfs_handle[0]);
  devfs_unregister(is->arch.devfs_handle[1]);
#endif

  /* board_ops may not have been filled in */
  if (is->board_ops.disable_interrupt != NULL)
    is->board_ops.disable_interrupt(is);

  mx_teardown_writecomb(is);
  if (mx_instance_finalize(is) != 0) {
    MX_WARN(("Could not destroy instance, big problems in perspective\n"));
  }
  free_irq(is->arch.irq, is);
#ifdef CONFIG_PCI_MSI
  if (is->msi_enabled)
    pci_disable_msi(is->arch.pci_dev);
  is->msi_enabled = 0;
#endif
  /* release the kernel window page allocated at creation time */
  mx_unreserve_page(is->kernel_window);
  free_page((unsigned long)is->kernel_window);
  mx_kfree(is);
}

/****************************************************************
 * mx_linux_init_one
 *
 * Initializes one Myrinet card.  Called by the kernel pci
 *   scanning routines when the module is loaded.
 ****************************************************************/

/*
 * PCI probe entry point (see mx_driver).
 *
 * The device is ignored with -ENODEV when mx_bus selects another bus or
 * when mx_max_instance boards are already in use.  Otherwise the board is
 * created and the status of mx_linux_create_instance() is returned.
 * ent is unused.
 */
static int
mx_linux_init_one (struct pci_dev *dev, const struct pci_device_id *ent)
{
  int rc;

  if (mx_bus != -1 && dev->bus->number != mx_bus) {
    MX_INFO(("Ignoring Myri device %s because mx_bus == 0x%02x\n", 
             mx_pci_name(dev), mx_bus));
    return -ENODEV;
  }

  if (mx_num_instances >= mx_max_instance) {
    MX_INFO(("Ignoring Myri device %s because mx_max_instance == %d\n", 
             mx_pci_name(dev), mx_max_instance));
    return -ENODEV;
  }

  rc = mx_linux_create_instance(dev);
  if (rc != 0) {
    MX_WARN (("Failed to initialize Myrinet Board at %s (%d)\n",
              mx_pci_name(dev), rc));
  }
  return rc;
}

/****************************************************************
 * mx_linux_remove_one
 *
 * Does what is necessary to shutdown one Myrinet device. Called
 *   once for each Myrinet card by the kernel when a module is
 *   unloaded.
 ****************************************************************/

/*
 * PCI remove entry point (see mx_driver): destroy the instance stored in
 * the device's driver data, if any, then clear the driver data.
 */
static void
mx_linux_remove_one (struct pci_dev *pdev)
{
  mx_instance_state_t *is = (mx_instance_state_t *) pci_get_drvdata(pdev);

  if (is)
    mx_linux_destroy_instance(is);

  pci_set_drvdata(pdev,0);
}


#define MX_PCI_DEVICE_MYRINET 0x8043
#define MX_PCI_VENDOR_MYRICOM 0x14c1

/*
 * PCI ids claimed by this driver, any subsystem ids accepted.  Which
 * entries are present depends on MX_2G_ENABLED / MX_10G_ENABLED; the
 * zeroed entry terminates the table.
 */
static struct pci_device_id mx_pci_tbl[] = {
#if MX_2G_ENABLED
  {MX_PCI_VENDOR_MYRICOM, MX_PCI_DEVICE_MYRINET,
     PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
#endif
#if MX_10G_ENABLED
  {MX_PCI_VENDOR_MYRICOM, MX_PCI_DEVICE_Z8E,
     PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
#endif
  {0,},
};

/* PCI driver description: probe/remove callbacks for the devices listed
   in mx_pci_tbl.  Unregistered in mx_cleanup_module(). */
static struct pci_driver mx_driver = {
  .name = "mx_driver",
  .probe = mx_linux_init_one,
  .remove = mx_linux_remove_one,
  .id_table = mx_pci_tbl,
};

#if LINUX_XX >= 26
MODULE_DEVICE_TABLE(pci, mx_pci_tbl);
#endif

/* Kernel version/release strings taken from the headers the module was
   compiled against; compared with the running kernel's strings in
   mx_init_module().  The version is empty when the headers do not
   define UTS_VERSION. */
#if defined UTS_VERSION
static char mx_linux_uts_version[] = UTS_VERSION;
#else
static char mx_linux_uts_version[] = "";
#endif
static char mx_linux_uts_release[] = UTS_RELEASE;

/* Build a string in the driver to indicate the kernel version for
   which it was built.  The string should be in the form of a /bin/sh
   variable setting.  We will extract this string later using the
   "strings" program and install the driver at
   /lib/modules/<UTS_RELEASE>/kernel/drivers/net. */
char *MX_UTS_RELEASE = "MX_UTS_RELEASE=\"" UTS_RELEASE "\"";


#if MX_USE_WATCHDOG_THREAD
/*
 * Body of the watchdog kernel thread (MX_USE_WATCHDOG_THREAD builds).
 * Calls mx_watchdog_body() and then sleeps on mx_watchdog_queue for up to
 * MX_WATCHDOG_TIMEOUT seconds, until mx_module_is_exiting is set; then
 * signals mx_watchdog_completion and exits.  The argument is unused.
 */
static int
mx_watchdog_thread(void *unused)
{
  mx_lxx_daemonize("mx_watchdog");
  while (!mx_module_is_exiting) {
    mx_watchdog_body();
    /* sleep until the timeout expires or mx_cleanup_module() wakes us */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
    wait_event_interruptible_timeout(mx_watchdog_queue, 
		       mx_module_is_exiting, MX_WATCHDOG_TIMEOUT * HZ);
#else
    interruptible_sleep_on_timeout(&mx_watchdog_queue, MX_WATCHDOG_TIMEOUT * HZ);
#endif
    /* for obsolete kernels where daemonize does not  block all signals */
    flush_signals(current);
  }
  /* lets mx_cleanup_module() know the thread is gone */
  complete_and_exit(&mx_watchdog_completion, 0);
}

#else
/*
 * Runs one watchdog pass, then re-arms mx_watchdog_timer
 * MX_WATCHDOG_TIMEOUT seconds ahead unless the module is exiting.
 * The argument is unused.
 */
static void
mx_watchdog_wrapper(void *unused)
{
  mx_watchdog_body();

  if (mx_module_is_exiting)
    return;

  /* re-arm the timer */
  mx_watchdog_timer.expires = jiffies + MX_WATCHDOG_TIMEOUT * HZ;
  add_timer(&mx_watchdog_timer);
}

/*
 * Watchdog timer callback: schedules mx_watchdog_work, unless the module
 * is exiting.  The argument is unused.
 */
static void
mx_watchdog(unsigned long unused)
{
  if (mx_module_is_exiting)
    return;

  /* schedule the watchdog body */
  mx_lxx_schedule_work(&mx_watchdog_work);
}
#endif

/*
 * Module unload: stop the watchdog, unregister the 32-bit ioctl
 * conversions, unregister the PCI driver (which destroys every board),
 * finalize the common driver code, then remove the control device nodes,
 * the device class and the character devices.
 */
void
mx_cleanup_module (void)
{

  /* tells timers, work items and the watchdog thread to stop re-arming */
  mx_module_is_exiting = 1;

#if MX_KERNEL_LIB
  mx_finalize_klib();
#endif

#if MX_USE_WATCHDOG_THREAD
  /* wake the watchdog thread and wait until it has exited */
  wake_up(&mx_watchdog_queue);
  wait_for_completion(&mx_watchdog_completion);
#else
  /* a non-NULL function is used as "the timer was set up" */
  if (mx_watchdog_timer.function) {
    del_timer_sync(&mx_watchdog_timer);
    mx_lxx_flush_scheduled_work();
    del_timer_sync(&mx_watchdog_timer);
    /* no timer or work can be scheduled anymore:
       a pending timer cannot exists because of exiting=1;flush;del_timer
       a scheduled work cannot exists because of exiting=1;del_timer;flush
    */
  }
#endif
#if (MX_CPU_x86_64 || MX_CPU_powerpc64) && !defined HAVE_COMPAT_IOCTL
  mx_linux_unregister_ioctl32();
#endif /* MX_CPU_x86_64 || MX_CPU_powerpc64 */

  /* this call is responsible for tearing down each instance */
  if (mx_linux_pci_driver_registered)
    pci_unregister_driver(&mx_driver);

  (void) mx_finalize_driver();
#if MX_DEBUG  
  /* allocation counters should balance once everything is freed */
  if (kmalloc_cnt == kfree_cnt && vmalloc_cnt == vfree_cnt)
    MX_DEBUG_PRINT (MX_DEBUG_MALLOC, ("No leaks\n"));
  else
    MX_DEBUG_PRINT (MX_DEBUG_MALLOC, 
		    ("MX: Memory leak info:\n"
		     "\t kmallocs  = %d\n"
		     "\t vmallocs  = %d\n"
		     "\t kfrees    = %d\n"
		     "\t vfrees    = %d\n",
		     kmalloc_cnt, vmalloc_cnt, kfree_cnt, vfree_cnt));
#endif

  /* remove the "mxctl" and "mxctlp" control nodes */
#if LINUX_XX >= 26
  devfs_remove("mx%s", "ctl");
  devfs_remove("mx%s", "ctlp");
  if (mx_udev) {
    mx_class_device_destroy(mx_class, MKDEV(MX_MAJOR, MX_CTL));
    mx_class_device_destroy(mx_class, MKDEV(MX_MAJOR, MX_CTLP));
  }
#else
  devfs_unregister(mx_devfs_handle[0]);
  devfs_unregister(mx_devfs_handle[1]);
#endif

  /* give back the class and the character device numbers */
#if LINUX_XX >= 26
  mx_linux_class_fini();
  mx_linux_cdev_fini();
#else
  unregister_chrdev(MX_MAJOR, "mx");
#endif
  MX_INFO (("driver unloaded\n"));
  
}

/*
 * Module entry point (registered with module_init).
 *
 * Steps, in order:
 *  - log the running kernel and warn if the headers used at build time
 *    differ from it;
 *  - refuse to load on a PAE / non-PAE mismatch (x86);
 *  - derive the limit on user-pinned pages from total RAM;
 *  - (2.4 rmap kernels) validate the activate_page symbol supplied to the
 *    module by cross-checking the address of sprintf;
 *  - register the device class / character device and the control nodes;
 *  - initialise the core driver, register the PCI driver, start the
 *    watchdog and, if built in, the kernel library.
 *
 * Returns 0 on success, -EBUSY when the class or character device cannot
 * be registered, -ENODEV on a PAE mismatch or when driver / PCI
 * initialisation fails, and the negated mx_init_klib() result when the
 * kernel library fails to initialise.
 */
int
mx_init_module (void)
{
  struct sysinfo mem_info;
#if MX_KERNEL_LIB
  int error;
#endif

  MX_INFO (("On %s, kernel version: %s %s\n", 
	    mx_current_utsname.machine,
	    mx_current_utsname.release,
	    mx_current_utsname.version));
  if (strcmp(mx_current_utsname.release, mx_linux_uts_release) != 0 ||
      strcmp(mx_current_utsname.version, mx_linux_uts_version) != 0) {
    MX_INFO (("MX module compiled with kernel headers of %s %s\n",
	      mx_linux_uts_release, mx_linux_uts_version));
  }
  
#if defined CONFIG_X86_PAE
  if (!cpu_has_pae || !(read_cr4() & X86_CR4_PAE)) {
      MX_WARN (("Trying to load PAE-enabled module on a non-PAE kernel\n"
		"please recompile appropriately\n"));
      return -ENODEV;
  }
#elif MX_CPU_x86 
  if (cpu_has_pae && (read_cr4() & X86_CR4_PAE)) {
    MX_WARN (("Trying to load non-PAE module on a PAE-enabled kernel\n"
		"please recompile appropriately\n"));
    return -ENODEV;
  }
#endif

#if defined(CONFIG_SMP)
  MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT,
		    ("MX driver compiled with CONFIG_SMP enabled\n"));
#else
  /* XXX some way to check if kernel is SMP without looking at symbol table?*/
  MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT,
		    ("MX driver compiled without CONFIG_SMP enabled\n"));
#endif

  si_meminfo(&mem_info);
  mem_total_pages = mem_info.totalram;

  /* these are sanity limits on registered memory to prevent the user
     from bringing the system to a complete stop */
  if (mem_total_pages >= (2 * MX_MAX_SAVE_FROM_PINNED)) {
    /* system with quite enough memory */
    unsigned long max_big;
    mx_max_user_pinned_pages_start = mem_total_pages - MX_MAX_SAVE_FROM_PINNED;
    max_big =	MX_MAX_USER_PINNED_BIGMEM(mem_total_pages);
    if (mx_max_user_pinned_pages_start > max_big)
      mx_max_user_pinned_pages_start = max_big;
  } else {
    mx_max_user_pinned_pages_start
      = MX_MAX_USER_PINNED_SMALLMEM(mem_total_pages);
  }

  mx_max_user_pinned_pages = mx_max_user_pinned_pages_start;

  MX_INFO (("Memory available for registration: %ld pages (%ld MBytes)\n",
	      mx_max_user_pinned_pages_start,
	      mx_max_user_pinned_pages_start >> (20 - PAGE_SHIFT)));

#if defined HAVE_ASM_RMAP_H && LINUX_XX == 24
#if MX_CPU_ia64
  {
    /* the symbols from System.map are code pointers, while C function
       pointers point to a descriptor (code_ptr,got_ptr):
       compare and reconstruct accordingly */
    static struct fptr {
      uint64_t code;
      uint64_t got;
    } func_desc;
    struct fptr *s = (void *)sprintf;
    if (mx_activate_page_symbol && s->code != mx_sprintf_symbol) {
      mx_activate_page_symbol = 0;
      MX_WARN (("mx_activate_page_symbol ignored: sprintf check failed\n"));
    } else if (mx_activate_page_symbol) {
      /* only build a descriptor when a symbol was actually supplied;
	 otherwise the symbol must stay 0 so that it is reported (and
	 treated) as absent instead of pointing at a descriptor whose
	 code address is 0 */
      func_desc.code = mx_activate_page_symbol;
      func_desc.got = s->got;
      mx_activate_page_symbol = (size_t)&func_desc;
    }
  }
#else
  if (mx_activate_page_symbol && mx_sprintf_symbol != (uintptr_t)sprintf)
    {
      mx_activate_page_symbol = 0;
      MX_WARN (("mx_activate_page_symbol ignored: sprintf check failed\n"));
    }
#endif
  if (mx_activate_page_symbol)
    {
      MX_INFO (("activate_page (0x%lx) used: good\n", mx_activate_page_symbol));
    }
  else
    {
      MX_INFO (("no activate_page, swapping might cause unresponsive state\n"));
    }
#endif

#if LINUX_XX >= 26
  if (mx_linux_class_init())
    return -EBUSY;
  
  if (mx_linux_cdev_init()) {
    /* undo the class registration done just above */
    mx_linux_class_fini();
    MX_WARN(("register_chrdev failed (other myrinet driver loaded?)\n"));
    return -EBUSY;
  }
#else  

  if (register_chrdev(MX_MAJOR, "mx", &mx_fops) >= 0) {
    MX_DEBUG_PRINT (MX_DEBUG_BOARD_INIT,
		      ("mx: register_chrdev succeeded\n"));
  }
  else {
    MX_WARN(("register_chrdev failed (other myrinet driver loaded?)\n"));
    return -EBUSY;
  }
#endif /* linux 26 */

  /* control device nodes */
  mx_devfs_register("mxctl", NULL, MX_CTL);
  mx_devfs_register("mxctlp", NULL, MX_CTLP);

#if (MX_CPU_x86_64 || MX_CPU_powerpc64) && !defined HAVE_COMPAT_IOCTL
  /* not fatal: only 32bit applications are affected */
  if (mx_linux_register_ioctl32()) {
    MX_WARN (("Failed registering 32bit ioctls. 32bit apps might not work\n"));
  }
#endif /* MX_CPU_x86_64 || MX_CPU_powerpc64 */
  
  /* from now we go through mx_cleanup_module in case of error */

  if (mx_init_driver() != 0) {
    MX_WARN(("Driver Initialization failure\n"));
    mx_cleanup_module();
    return -ENODEV;
  }

  if (pci_module_init(&mx_driver) != 0) {
    mx_cleanup_module();
    MX_WARN (("No board initialized\n"));
    return -ENODEV;
  }

  /* lets mx_cleanup_module know the PCI driver must be unregistered */
  mx_linux_pci_driver_registered = 1;
  MX_INFO (("%d Myrinet board%s found and initialized\n", 
	      mx_num_instances,
	      mx_num_instances == 1 ? "" : "s" ));
  
#if MX_USE_WATCHDOG_THREAD
  init_completion(&mx_watchdog_completion);
  init_waitqueue_head(&mx_watchdog_queue);
  if (kernel_thread(mx_watchdog_thread, 0, CLONE_FS | CLONE_FILES) < 0) {
    MX_WARN(("Cannot start the watchdog thread: No Parity recovery\n"));
    /* no thread will signal the completion: do it here so that
       mx_cleanup_module does not wait forever */
    complete(&mx_watchdog_completion);
  }
#else
  /* setup a timer for the watchdog and start it running */
  init_timer(&mx_watchdog_timer);
  mx_watchdog_timer.expires = jiffies + MX_WATCHDOG_TIMEOUT * HZ;
  mx_watchdog_timer.data = 0;
  mx_watchdog_timer.function = mx_watchdog;
  add_timer(&mx_watchdog_timer);
  MX_LXX_INIT_WORK(&mx_watchdog_work, mx_watchdog_wrapper, NULL);
#endif

#if MX_KERNEL_LIB
  /* initialize the kernel library */
  error = mx_init_klib();
  if (error) {
    MX_WARN (("Kernel Lib Initialization failure\n"));
    mx_cleanup_module();
    /* NOTE(review): assumes mx_init_klib returns a positive errno -- confirm */
    return -error;
  }
#endif

  return 0;
}

/*
 * Allocate 2^log2 contiguous kernel pages and map them for bidirectional
 * PCI DMA on the device of instance "is".
 *
 * On success *alloc_addr and *addr both hold the kernel virtual address,
 * pin->dma holds the bus address split in high/low 32-bit halves, pin->va
 * holds the virtual address, and 0 is returned.
 * Returns ENOMEM when the pages cannot be allocated and ENXIO when the DMA
 * mapping fails; in the latter case the pages are released and both output
 * pointers are cleared (pin->dma has nevertheless been written).
 */
int
mx_alloc_dma_pages(mx_instance_state_t *is, char **alloc_addr, 
		  char **addr, mx_page_pin_t *pin, int log2)
{
  const size_t nbytes = (1 << log2) * PAGE_SIZE;
  char *buf;
  dma_addr_t bus;

  buf = (char *)__get_free_pages(GFP_KERNEL, log2);
  *addr = buf;
  *alloc_addr = buf;
  if (buf == NULL)
    return ENOMEM;

#if MX_DEBUG
  /* poison fresh pages in debug builds */
  memset(buf, -1, nbytes);
#endif

  bus = pci_map_single(is->arch.pci_dev, buf, nbytes, PCI_DMA_BIDIRECTIONAL);
  pin->dma.high = (uint64_t)bus >> 32;
  pin->dma.low = bus;

  if (bus != INVALID_DMA_ADDR) {
    pin->va = (uint64_t)(unsigned long)buf;
    return 0;
  }

  /* mapping failed: give the pages back and report the error */
  mx_printf_once("Warning:pci_map failure: should be free=%d pages\n",
		 atomic_read(&is->arch.free_iommu_pages));
  free_pages((unsigned long)buf, log2);
  *addr = NULL;
  *alloc_addr = NULL;
  return ENXIO;
}

/*
 * Single-page convenience wrapper around mx_alloc_dma_pages():
 * same arguments and return value, with an allocation order of 0.
 */
int
mx_alloc_dma_page(mx_instance_state_t *is, char **alloc_addr, 
		  char **addr, mx_page_pin_t *pin)
{
  const int single_page_order = 0;

  return mx_alloc_dma_pages(is, alloc_addr, addr, pin, single_page_order);
}

/*
 * Undo mx_alloc_dma_pages(): unmap the bus address recorded in pin->dma
 * and free the 2^log2 pages starting at *alloc_addr.
 * In debug builds *alloc_addr is cleared afterwards.
 */
void
mx_free_dma_pages(mx_instance_state_t *is, char **alloc_addr, mx_page_pin_t *pin, int log2)
{
  const size_t nbytes = (1 << log2) * PAGE_SIZE;
  dma_addr_t bus;

  /* rebuild the bus address from its two 32-bit halves */
  bus = ((uint64_t)pin->dma.high << 32) + pin->dma.low;

  pci_unmap_single(is->arch.pci_dev, bus, nbytes, PCI_DMA_BIDIRECTIONAL);
  free_pages((unsigned long)*alloc_addr, log2);

  if (MX_DEBUG) {
    *alloc_addr = 0;
  }
}

/*
 * Single-page convenience wrapper around mx_free_dma_pages()
 * (allocation order 0).
 */
void
mx_free_dma_page(mx_instance_state_t *is, char **alloc_addr, mx_page_pin_t *pin)
{
  const int single_page_order = 0;

  mx_free_dma_pages(is, alloc_addr, pin, single_page_order);
}

/*
 * Allocate the backing pages of copyblock "cb": one zeroed DMA page per
 * PAGE_SIZE of cb->size, each recorded in cb->pins[] and marked reserved.
 *
 * The number of pages is computed once and used both to size cb->pins and
 * to bound the allocation loop, so the loop can never write past the end
 * of the array when cb->size is not a multiple of PAGE_SIZE.  The count
 * (cb->size / PAGE_SIZE) is the same one mx_optimized_free_copyblock()
 * walks when releasing the pages.
 *
 * Returns 0 on success, ENOMEM if the pin array cannot be allocated, or
 * the status of the failing page allocation; on failure everything
 * allocated so far is released.
 */
int
mx_optimized_alloc_copyblock(mx_instance_state_t *is, mx_copyblock_t *cb)
{
  const unsigned long npages = cb->size / PAGE_SIZE;
  unsigned long i;
  char *va;
  int status;

  /* zeroed, so that a NULL va marks a slot that holds no page yet */
  cb->pins = mx_kmalloc(sizeof(cb->pins[0]) * npages, 
			MX_MZERO|MX_WAITOK);
  if (cb->pins == NULL) {
    MX_WARN(("copyblock pin info allocation failed due to lack of memory\n"));
    status = ENOMEM;
    goto abort_with_nothing;
  }
 
  for (i = 0; i < npages; i++) {
    status = mx_alloc_zeroed_dma_page(is, &va, &va, 
				      &cb->pins[i]);
    if (status)
      goto abort_with_dma_pages;
    mx_reserve_page((void *)va);
  }

  return 0;

 abort_with_dma_pages:
  /* frees the pages already pinned as well as the pin array */
  mx_optimized_free_copyblock(is, cb);

 abort_with_nothing:
  return status;
 
}

/*
 * Read the PCI secondary status register of the bridge directly upstream
 * of the NIC.
 *
 * Returns 0 when there is no instance, no PCI device or no upstream
 * bridge.  If the config read reports an error a warning is logged and
 * whatever is in the status word is returned (it is preset to all ones).
 */
uint16_t
mx_bridge_pci_sec_status(mx_instance_state_t *is)
{
  struct pci_dev *upstream;
  uint16_t sec_status;

  if (is == NULL || is->arch.pci_dev == NULL)
    return 0;

  upstream = is->arch.pci_dev->bus->self;
  if (upstream == NULL)
    return 0;

  sec_status = (uint16_t)-1;
  if (pci_read_config_word(upstream, PCI_SEC_STATUS, &sec_status) != 0) {
    MX_WARN(("Error while reading PCI sec status on %s\n", mx_pci_name(upstream)));
  }

  return sec_status;
}

/*
 * Release the pages of copyblock "cb" and its pin array.
 *
 * Does nothing when cb->pins is NULL.  Slots whose recorded virtual
 * address is 0 hold no page and are skipped; every other page is
 * unreserved and its DMA mapping and memory are freed.  cb->pins is
 * reset to NULL at the end.
 */
void
mx_optimized_free_copyblock(mx_instance_state_t *is, mx_copyblock_t *cb)
{
  int slot;

  if (cb->pins == NULL)
    return;

  for (slot = 0; slot < (cb->size / PAGE_SIZE); slot++) {
    char *page_va = (char *)(unsigned long)cb->pins[slot].va;

    if (page_va == 0)
      continue;

    mx_unreserve_page(page_va);
    mx_free_dma_page(is, &page_va, &cb->pins[slot]);
  }

  mx_kfree(cb->pins);
  cb->pins = NULL;
}

/* Copy function from/to user space in ANOTHER process */

/*
 * Copy "len" bytes from kernel buffer "ksrc" to user address "udst" in the
 * address space "dst_mm", which may belong to another process.
 *
 * The destination is processed one page at a time: the page is pinned
 * with get_user_pages() (write access), temporarily mapped, filled,
 * marked dirty and released.
 *
 * Returns 0 on success, or the negated get_user_pages() result if a page
 * cannot be pinned.
 */
int
mx_copy_to_user_mm(mx_uaddr_t udst, void * ksrc, struct mm_struct *dst_mm,
		   uint32_t len)
{
  /* this code is inspired from access_process_vm */
  struct page *page;
  int status;

  while (len > 0) {
    /* never cross a page boundary within one iteration */
    uint32_t offset = udst & ~PAGE_MASK;
    uint32_t chunk = MIN(len, PAGE_SIZE - offset);
    char *kdst;

    /* Using "current" rather than "p" is not a typo.  It is done so
       that page faults are charged to the caller.
       Besides, task p might disappear at any time */
    mx_mmap_down_write(dst_mm);
    status = get_user_pages(current, dst_mm, udst, 1, 1, 0, &page, NULL);
    mx_mmap_up_write(dst_mm);
    if (status != 1) {
      return -status;
    }
    kdst = kmap_atomic(page, KM_USER0);
    /* copy_to_user_page should be used instead. But it uses some symbols that
     * are not exported on some architectures...
     */
    memcpy(kdst+offset, ksrc, chunk);
    kunmap_atomic(kdst, KM_USER0);
    /* Mark the page dirty only after the atomic mapping is gone: the
       dirtying helper may need the page lock, and nothing that can sleep
       is allowed between kmap_atomic and kunmap_atomic.  The page is
       still pinned here, so it cannot go away. */
    mx_set_page_dirty_lock(page);
    put_page(page);
    ksrc += chunk;
    udst += chunk;
    len -= chunk;
  }

  return 0;
}

/*
 * Copy "len" bytes from user address "usrc" in the address space "src_mm"
 * (possibly another process) into kernel buffer "kdst".
 *
 * Works page by page: pin the source page with get_user_pages() (read
 * access), map it temporarily, copy, unmap and release it.
 *
 * Returns 0 on success, or the negated get_user_pages() result if a page
 * cannot be pinned.
 */
int
mx_copy_from_user_mm(char *kdst, mx_uaddr_t usrc, struct mm_struct *src_mm,
		     uint32_t len)
{
  /* modelled after access_process_vm */
  struct page *pg;
  int npinned;
  uint32_t in_page;
  uint32_t nbytes;
  char *mapped;

  while (len > 0) {
    /* stay within the current source page */
    in_page = usrc & ~PAGE_MASK;
    nbytes = MIN(len, PAGE_SIZE - in_page);

    /* "current" is passed on purpose instead of the owner of src_mm:
       page faults get charged to the caller, and the other task might
       disappear at any time */
    mx_mmap_down_write(src_mm);
    npinned = get_user_pages(current, src_mm, usrc, 1, 0, 0, &pg, NULL);
    mx_mmap_up_write(src_mm);
    if (npinned != 1)
      return -npinned;

    /* copy_from_user_page would be the proper call, but it relies on
       symbols that some architectures do not export */
    mapped = kmap_atomic(pg, KM_USER0);
    memcpy(kdst, mapped + in_page, nbytes);
    kunmap_atomic(mapped, KM_USER0);
    put_page(pg);

    len -= nbytes;
    kdst += nbytes;
    usrc += nbytes;
  }

  return 0;
}

/* OS specific callback for direct get: copy "len" bytes from user address
 * "usrc" in another process' address space ("src_space", a struct
 * mm_struct pointer) to user address "udst" of the current process.
 *
 * Returns 0 on success, the negated get_user_pages() result if a source
 * page cannot be pinned, or EFAULT if writing to the destination fails.
 */
int
mx_arch_copy_user_to_user(mx_uaddr_t udst,
			  mx_uaddr_t usrc, void * src_space,
			  uint32_t len)
{
  struct mm_struct *src_mm = (struct mm_struct *) src_space;
  struct page *pg;
  int rc;
  uint32_t in_page;
  uint32_t nbytes;
  char *mapped;

  while (len > 0) {
    /* one source page per iteration */
    in_page = usrc & ~PAGE_MASK;
    nbytes = MIN(len, PAGE_SIZE - in_page);

    /* "current" is passed on purpose instead of the owner of src_mm:
       page faults get charged to the caller, and the other task might
       disappear at any time */
    mx_mmap_down_write(src_mm);
    rc = get_user_pages(current, src_mm, usrc, 1, 0, 0, &pg, NULL);
    mx_mmap_up_write(src_mm);
    if (rc != 1)
      return -rc;

    mapped = kmap(pg);
    rc = copy_to_user((void*)udst, mapped + in_page, nbytes);
    kunmap(pg);
    put_page(pg);
    if (rc != 0)
      return EFAULT;

    len -= nbytes;
    udst += nbytes;
    usrc += nbytes;
  }

  return 0;
}

/*
 * Direct get: copy "length" bytes described by the source segments of the
 * process that opened "src_es" into the destination segments of the
 * current process.
 *
 * The source mm is looked up from the opener's pid and kept referenced
 * through mm_count for the duration of the call.  When a segment count is
 * greater than one, the first entry's vaddr is the user address of the
 * real segment array, which is fetched into a temporary kernel copy (from
 * the current process for the destination, from the source mm for the
 * source).
 *
 * Returns 0 on success, ESRCH if the source task or its mm is gone,
 * ENOMEM on allocation failure, EFAULT if the destination segment list
 * cannot be read, or the status of mx_copy_from_user_mm() /
 * mx_direct_get_common().
 *
 * NOTE(review): p->mm is sampled under the tasklist lock only and the mm
 * is held through mm_count rather than mm_users -- confirm this is enough
 * to keep the source address space usable during the copy.
 */
int
mx_direct_get(mx_endpt_state_t *dst_es, mx_shm_seg_t *dst_segs, uint32_t dst_nsegs,
	      mx_endpt_state_t *src_es, mx_shm_seg_t *src_segs, uint32_t src_nsegs,
	      uint32_t length)
{
  struct mm_struct *mm_src;
  struct task_struct *p;
  int status = 0;

  mx_tasklist_lock();
  p = find_task_by_pid(src_es->opener.pid);
  if (!p || !p->mm) {
    mx_tasklist_unlock();
    return ESRCH;
  }
  mm_src = p->mm;
  atomic_inc(&mm_src->mm_count);
  mx_tasklist_unlock();

  /* get destination segments from current mm */
  if (dst_nsegs > 1) {
    mx_uaddr_t uptr = dst_segs[0].vaddr;
    dst_segs = mx_kmalloc(dst_nsegs * sizeof(*dst_segs), 0);
    if (!dst_segs) {
      status = ENOMEM;
      goto abort;
    }
    /* copy_from_user returns the number of bytes left uncopied, which is
       not an error code: report a bad user pointer as EFAULT, like
       mx_arch_copy_user_to_user does for copy_to_user */
    if (copy_from_user(dst_segs, (void*) uptr, dst_nsegs * sizeof(*dst_segs))) {
      status = EFAULT;
      goto abort_with_dst_segs;
    }
  }

  /* get source segments from another mm */
  if (src_nsegs > 1) {
    mx_uaddr_t uptr = src_segs[0].vaddr;

    src_segs = mx_kmalloc(src_nsegs * sizeof(*src_segs), 0);
    if (!src_segs) {
      status = ENOMEM;
      goto abort_with_dst_segs;
    }

    status = mx_copy_from_user_mm((char *) src_segs, uptr, mm_src,
				  src_nsegs * sizeof(*src_segs));
    if (status) {
      goto abort_with_src_segs;
    }
  }

  status = mx_direct_get_common(dst_segs, dst_nsegs,
				mm_src, src_segs, src_nsegs,
				length);

  /* the labels below are also the normal exit path: the temporary segment
     copies only exist when the matching count is greater than one */
 abort_with_src_segs:
  if (src_nsegs > 1)
    mx_kfree (src_segs);
 abort_with_dst_segs:
  if (dst_nsegs > 1)
    mx_kfree (dst_segs);
 abort:
  /* dropping the last reference here means the owner already exited; the
     mm is not freed from this path, hence the warning */
  if (atomic_dec_and_test(&mm_src->mm_count)) {
    MX_WARN(("src process exited during get: mm leak\n"));
  }
  return status;
}


/****************************************************************
 * PCI config space functions
 ****************************************************************/

/*
 * pcibios_to_mx() generates one PCI config-space accessor named
 * mx_<rw>_pci_config_<size>().  Each generated function asserts that the
 * instance and its pci_dev are set, then forwards to the Linux
 * pci_<rw>_config_<linuxname>() call and returns its result.
 *
 *   rw        - "read" or "write"
 *   size      - access width in bits; selects the function name and the
 *               uint<size>_t type of the value parameter
 *   linuxname - suffix of the Linux accessor (byte, word, dword)
 *   c_type    - type the value is cast to before the Linux call
 *   star      - "*" for reads (value is an output pointer), empty for
 *               writes (value is passed directly)
 *
 * NOTE(review): the offset is cast to unsigned char before the call, so
 * offsets above 255 cannot be addressed through these wrappers -- confirm
 * that no caller needs the extended config space.
 */
#define pcibios_to_mx(rw, size, linuxname, c_type, star)           \
int                                                                  \
mx_##rw##_pci_config_##size (mx_instance_state_t *is,            \
                               uint32_t offset,                      \
                               uint##size##_t star value)            \
{                                                                    \
  mx_assert (is);                                                  \
  mx_assert (is->arch.pci_dev);                                    \
  return (pci_##rw##_config_##linuxname (is->arch.pci_dev,           \
                                          (unsigned char) offset,    \
                                          (c_type star) value));     \
}

/* instantiate the 32-, 16- and 8-bit read and write accessors */
pcibios_to_mx (read, 32, dword, unsigned int, *);
pcibios_to_mx (write, 32, dword, unsigned int,);
pcibios_to_mx (read, 16, word, unsigned short, *);
pcibios_to_mx (write, 16, word, unsigned short,);
pcibios_to_mx (read, 8, byte, unsigned char, *);
pcibios_to_mx (write, 8, byte, unsigned char,);

/*
 * Reset the PCI Express link of the NIC by toggling the link-disable bit
 * in the Link Control register of the upstream bridge: set the bit, spin
 * for a while, then restore the original register value.
 *
 * Returns 0 on success, -1 if there is no upstream bridge, the bridge has
 * no PCI Express capability, or its Link Control register cannot be read.
 */
int
mx_pcie_link_reset(mx_instance_state_t *is)
{
  struct pci_dev *bridge = is->arch.pci_dev->bus->self;
  unsigned cap;
  uint8_t link_ctl;

  /* Any non-zero result from the config read is a failure: the PCI
     accessors report errors with non-zero codes that are not necessarily
     negative (mx_bridge_pci_sec_status tests the result the same way).
     Bailing out here keeps link_ctl from being used uninitialised. */
  if (!bridge 
      || (!(cap = pci_find_capability(bridge, PCI_CAP_ID_EXP)))
      || pci_read_config_byte(bridge, cap + PCI_EXP_LNKCTL, &link_ctl) != 0) {
    MX_INFO(("NIC at %s has no PCIE upstream bridge !!\n", mx_pci_name(is->arch.pci_dev)));
    return -1;
  }
  /* take the link down ... */
  pci_write_config_byte(bridge, cap+PCI_EXP_LNKCTL, link_ctl | PCI_EXP_LNKCTL_DISABLE);
  mx_spin(200000);
  /* ... and bring it back with the original control value */
  pci_write_config_byte(bridge, cap+PCI_EXP_LNKCTL, link_ctl);
  return 0;
}

void
mx_set_default_hostname(void)
{
  strncpy(mx_default_hostname, mx_current_utsname.nodename, sizeof(mx_default_hostname) - 1);
  mx_default_hostname[sizeof(mx_default_hostname) - 1] = '\0';
}

/* register the module load and unload entry points with the kernel */
module_init (mx_init_module);
module_exit (mx_cleanup_module);
